diff --git a/.vs/yolo_cpu/v14/.suo b/.vs/yolo_cpu/v14/.suo
index 53d8e43..a289d8d 100644
Binary files a/.vs/yolo_cpu/v14/.suo and b/.vs/yolo_cpu/v14/.suo differ
diff --git a/bin/coco.data b/bin/coco.data
deleted file mode 100644
index c8fb232..0000000
--- a/bin/coco.data
+++ /dev/null
@@ -1,8 +0,0 @@
-classes= 80
-train  = data/coco/trainvalno5k.txt
-valid  = data/5k.txt
-#valid = data/coco_val_5k.list
-names = coco.names
-backup = backup/
-eval=coco
-
diff --git a/bin/coco.names b/bin/coco.names
deleted file mode 100644
index ca76c80..0000000
--- a/bin/coco.names
+++ /dev/null
@@ -1,80 +0,0 @@
-person
-bicycle
-car
-motorbike
-aeroplane
-bus
-train
-truck
-boat
-traffic light
-fire hydrant
-stop sign
-parking meter
-bench
-bird
-cat
-dog
-horse
-sheep
-cow
-elephant
-bear
-zebra
-giraffe
-backpack
-umbrella
-handbag
-tie
-suitcase
-frisbee
-skis
-snowboard
-sports ball
-kite
-baseball bat
-baseball glove
-skateboard
-surfboard
-tennis racket
-bottle
-wine glass
-cup
-fork
-knife
-spoon
-bowl
-banana
-apple
-sandwich
-orange
-broccoli
-carrot
-hot dog
-pizza
-donut
-cake
-chair
-sofa
-pottedplant
-bed
-diningtable
-toilet
-tvmonitor
-laptop
-mouse
-remote
-keyboard
-cell phone
-microwave
-oven
-toaster
-sink
-refrigerator
-book
-clock
-vase
-scissors
-teddy bear
-hair drier
-toothbrush
diff --git a/bin/opencv_ffmpeg340_64.dll b/bin/opencv_ffmpeg340_64.dll
new file mode 100644
index 0000000..45dc839
Binary files /dev/null and b/bin/opencv_ffmpeg340_64.dll differ
diff --git a/bin/opencv_world340.dll b/bin/opencv_world340.dll
new file mode 100644
index 0000000..8836c52
Binary files /dev/null and b/bin/opencv_world340.dll differ
diff --git a/bin/predictions.png b/bin/predictions.png
index 7cec9c8..dcb94b0 100644
Binary files a/bin/predictions.png and b/bin/predictions.png differ
diff --git a/bin/video_yolo.sh b/bin/video_yolo.sh
deleted file mode 100644
index e1f06c5..0000000
--- a/bin/video_yolo.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-./darknet detector demo coco.names yolov3.cfg yolov3.weights -thresh 0.24 test.mp4
-
-
diff --git a/bin/xnor_voc.cmd b/bin/xnor_voc.cmd
index 6b02ffe..c5dc02f 100644
--- a/bin/xnor_voc.cmd
+++ b/bin/xnor_voc.cmd
@@ -1 +1 @@
-yolo_cpu.exe detector test tiny-yolo-voc_xnor/voc.names tiny-yolo-voc_xnor/tiny_yolo_xnor.cfg tiny-yolo-voc_xnor/tiny_yolo_xnor.weights -thresh 0.15 dog.jpg
\ No newline at end of file
+yolo_cpu.exe detector test tiny-yolo-voc_xnor/voc.names tiny-yolo-voc_xnor/tiny_yolo_xnor.cfg tiny-yolo-voc_xnor/tiny_yolo_xnor.weights -thresh 0.25 dog.jpg
\ No newline at end of file
diff --git a/bin/yolo_cpu.exe b/bin/yolo_cpu.exe
index 3f38c21..de162fa 100644
Binary files a/bin/yolo_cpu.exe and b/bin/yolo_cpu.exe differ
diff --git a/bin/yolo_cpu.ilk b/bin/yolo_cpu.ilk
deleted file mode 100644
index 51b2f5e..0000000
Binary files a/bin/yolo_cpu.ilk and /dev/null differ
diff --git a/bin/yolo_cpu.iobj b/bin/yolo_cpu.iobj
index 48710c7..fb64733 100644
Binary files a/bin/yolo_cpu.iobj and b/bin/yolo_cpu.iobj differ
diff --git a/bin/yolo_cpu.ipdb b/bin/yolo_cpu.ipdb
index fd276c3..7e1d059 100644
Binary files a/bin/yolo_cpu.ipdb and b/bin/yolo_cpu.ipdb differ
diff --git a/bin/yolo_cpu.pdb b/bin/yolo_cpu.pdb
index c0a0f6e..65e738f 100644
Binary files a/bin/yolo_cpu.pdb and b/bin/yolo_cpu.pdb differ
diff --git a/bin/yolo_cpu_demo.cmd b/bin/yolo_cpu_demo.cmd
deleted file mode 100644
index e0ea40d..0000000
--- a/bin/yolo_cpu_demo.cmd
+++ /dev/null
@@ -1,6 +0,0 @@
-
-
-yolo_cpu.exe detector demo coco.names yolov3.cfg yolov3.weights -thresh 0.24 test.mp4
-
-
-pause
\ No newline at end of file
diff --git a/src/additionally.c b/src/additionally.c
index c123a2c..16253a0 100644
--- a/src/additionally.c
+++ b/src/additionally.c
@@ -1,14 +1,6 @@
 #include "additionally.h"
 #include "gpu.h"
 
-#ifdef OPENCL
-#include "ocl.h"
-#endif
-
-#ifdef CUDNN
-#pragma comment(lib, "cudnn.lib")
-#endif
-
 #ifdef _OPENMP
 #include <omp.h>
 #endif
@@ -89,17 +81,6 @@ void yolov2_fuse_conv_batchnorm(network net)
                 }
 
                 l->batch_normalize = 0;
-#ifdef GPU
-                if (gpu_index >= 0) {
-                    push_convolutional_layer(*l);
-                }
-#endif
-
-#ifdef OPENCL
-                //if (gpu_index >= 0) {
-                ocl_push_convolutional_layer(*l);
-                //}
-#endif
             }
         }
         else {
@@ -219,25 +200,6 @@ void binary_align_weights(convolutional_layer *l)
     l->mean_arr = calloc(l->n, sizeof(float));
     get_mean_array(align_weights, align_weights_size, l->n, l->mean_arr);
 
-#ifdef GPU
-    cudaError_t status;
-    l->align_workspace_size = l->bit_align * l->size * l->size * l->c;
-    status = cudaMalloc((void **)&l->align_workspace_gpu, l->align_workspace_size * sizeof(float));
-    status = cudaMalloc((void **)&l->transposed_align_workspace_gpu, l->align_workspace_size * sizeof(float));
-    check_error(status);
-
-    //l->align_bit_weights_gpu = cuda_make_array(l->align_bit_weights, l->align_bit_weights_size * sizeof(char)/sizeof(float));
-    status = cudaMalloc((void **)&l->align_bit_weights_gpu, l->align_bit_weights_size);
-    check_error(status);
-    status = cudaMemcpy(l->align_bit_weights_gpu, l->align_bit_weights, l->align_bit_weights_size, cudaMemcpyHostToDevice);
-    check_error(status);
-    status = cudaMemcpy(l->binary_weights_gpu, l->binary_weights, m*k * sizeof(float), cudaMemcpyHostToDevice);
-    check_error(status);
-
-    l->mean_arr_gpu = cuda_make_array(l->mean_arr, l->n);
-    cudaDeviceSynchronize();
-#endif // GPU
-
     free(align_weights);
 }
 
@@ -279,72 +241,6 @@ static inline unsigned char get_bit(unsigned char const*const src, size_t index)
     return val;
 }
 
-/*
-static inline unsigned char reverse_byte_1(char a)
-{
-    return ((a & 0x1) << 7) | ((a & 0x2) << 5) |
-        ((a & 0x4) << 3) | ((a & 0x8) << 1) |
-        ((a & 0x10) >> 1) | ((a & 0x20) >> 3) |
-        ((a & 0x40) >> 5) | ((a & 0x80) >> 7);
-}
-
-static inline unsigned char reverse_byte(unsigned char a)
-{
-    return ((a * 0x0802LU & 0x22110LU) | (a * 0x8020LU & 0x88440LU)) * 0x10101LU >> 16;
-}
-
-static unsigned char lookup[16] = {
-    0x0, 0x8, 0x4, 0xc, 0x2, 0xa, 0x6, 0xe,
-    0x1, 0x9, 0x5, 0xd, 0x3, 0xb, 0x7, 0xf, };
-
-static inline unsigned char reverse_byte_3(unsigned char n) {
-    // Reverse the top and bottom nibble then swap them.
-    return (lookup[n & 0b1111] << 4) | lookup[n >> 4];
-}
-
-
-static inline void transpose8rS32_reversed_diagonale(unsigned char* A, int m, int n, unsigned char* B)
-{
-    unsigned x, y, t;
-
-    // Load the array and pack it into x and y.
-    x = (A[0] << 24) | (A[m] << 16) | (A[2 * m] << 8) | A[3 * m];
-    y = (A[4 * m] << 24) | (A[5 * m] << 16) | (A[6 * m] << 8) | A[7 * m];
-
-    t = (x ^ (x >> 7)) & 0x00AA00AA;  x = x ^ t ^ (t << 7);
-    t = (y ^ (y >> 7)) & 0x00AA00AA;  y = y ^ t ^ (t << 7);
-
-    t = (x ^ (x >> 14)) & 0x0000CCCC;  x = x ^ t ^ (t << 14);
-    t = (y ^ (y >> 14)) & 0x0000CCCC;  y = y ^ t ^ (t << 14);
-
-    t = (x & 0xF0F0F0F0) | ((y >> 4) & 0x0F0F0F0F);
-    y = ((x << 4) & 0xF0F0F0F0) | (y & 0x0F0F0F0F);
-    x = t;
-
-    B[7 * n] = reverse_byte(x >> 24);  B[6 * n] = reverse_byte(x >> 16);  B[5 * n] = reverse_byte(x >> 8);  B[4 * n] = reverse_byte(x);
-    B[3 * n] = reverse_byte(y >> 24);  B[2 * n] = reverse_byte(y >> 16);  B[1 * n] = reverse_byte(y >> 8);  B[0 * n] = reverse_byte(y);
-}
-
-void transpose_bin(char *A, char *B, const int n, const int m,
-    const int lda, const int ldb, const int block_size)
-{
-    int i;
-    #pragma omp parallel for
-    for (i = 0; i < n; i += 8) {
-        int j;
-        for (j = 0; j < m - 8; j += 8) {
-            int a_index = i*lda + j;
-            int b_index = j*ldb + i;
-            //transpose_8x8_bits_my(&A[a_index/8], &B[b_index/8], lda/8, ldb/8);
-            transpose8rS32_reversed_diagonale(&A[a_index / 8], lda / 8, ldb / 8, &B[b_index / 8]);
-        }
-        for (; j < m; ++j) {
-            if (get_bit(A, i*lda + j)) set_bit(B, j*ldb + i);
-        }
-    }
-}
-*/
-
 uint8_t reverse_8_bit(uint8_t a) {
     return ((a * 0x0802LU & 0x22110LU) | (a * 0x8020LU & 0x88440LU)) * 0x10101LU >> 16;
 }
@@ -434,563 +330,6 @@ void transpose_bin(uint32_t *A, uint32_t *B, const int n, const int m,
 
 // -------------- blas.c --------------
 
-
-#ifdef AVX
-
-#ifdef _WIN64
-// Windows
-#include <intrin.h>
-#else
-// Linux
-#include <x86intrin.h>
-#endif
-
-#include <ammintrin.h>
-#include <immintrin.h>
-#include <smmintrin.h>
-#include <emmintrin.h>
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=broad&expand=561
-
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide
-void gemm_nn(int M, int N, int K, float ALPHA,
-    float *A, int lda,
-    float *B, int ldb,
-    float *C, int ldc)
-{
-    int i, j, k;
-    for (i = 0; i < M; ++i) {
-        for (k = 0; k < K; ++k) {
-            float A_PART = ALPHA*A[i*lda + k];
-            __m256 a256, b256, c256, result256;    // AVX
-            a256 = _mm256_set1_ps(A_PART);
-            for (j = 0; j < N - 8; j += 8) {
-                b256 = _mm256_loadu_ps(&B[k*ldb + j]);
-                c256 = _mm256_loadu_ps(&C[i*ldc + j]);
-                // FMA - Intel Haswell (2013), AMD Piledriver (2012)
-                //result256 = _mm256_fmadd_ps(a256, b256, c256);
-                result256 = _mm256_mul_ps(a256, b256);
-                result256 = _mm256_add_ps(result256, c256);
-                _mm256_storeu_ps(&C[i*ldc + j], result256);
-            }
-
-            int prev_end = (N % 8 == 0) ? (N - 8) : (N / 8) * 8;
-            for (j = prev_end; j < N; ++j)
-                C[i*ldc + j] += A_PART*B[k*ldb + j];
-        }
-    }
-}
-
-
-#if defined(_MSC_VER) && _MSC_VER <= 1900
-static inline __int32 _mm256_extract_epi64(__m256i a, const int index) {
-    return a.m256i_i64[index];
-}
-
-static inline __int32 _mm256_extract_epi32(__m256i a, const int index) {
-    return a.m256i_i32[index];
-}
-#endif
-
-static inline float _castu32_f32(uint32_t a) {
-    return *((float *)&a);
-}
-
-#if defined(_MSC_VER)
-// Windows
-static inline float _mm256_extract_float32(__m256 a, const int index) {
-    return a.m256_f32[index];
-}
-#else
-// Linux
-static inline float _mm256_extract_float32(__m256 a, const int index) {
-    return _castu32_f32(_mm256_extract_epi32(_mm256_castps_si256(a), index));
-}
-#endif
-
-//From Berkeley Vision's Caffe!
-//https://github.com/BVLC/caffe/blob/master/LICENSE
-void im2col_cpu_custom(float* data_im,
-    int channels, int height, int width,
-    int ksize, int stride, int pad, float* data_col)
-{
-
-    int c;
-    const int height_col = (height + 2 * pad - ksize) / stride + 1;
-    const int width_col = (width + 2 * pad - ksize) / stride + 1;
-    const int channels_col = channels * ksize * ksize;
-
-    // optimized version
-    if (height_col == height && width_col == width && stride == 1 && pad == 1)// && is_fma_avx())
-    {
-        #pragma omp parallel for
-        for (c = 0; c < channels_col; ++c) {
-            int h, w;
-            int w_offset = c % ksize;
-            int h_offset = (c / ksize) % ksize;
-            int c_im = c / ksize / ksize;
-            for (h = pad; h < height_col - pad; ++h) {
-                for (w = pad; w < width_col - pad - 8; w += 8) {
-                    int im_row = h_offset + h - pad;
-                    int im_col = w_offset + w - pad;
-                    int col_index = (c * height_col + h) * width_col + w;
-
-                    //data_col[col_index] = data_im[im_col + width*(im_row + height*c_im)];
-                    __m256 src256 = _mm256_loadu_ps((float *)(&data_im[im_col + width*(im_row + height*c_im)]));
-                    _mm256_storeu_ps(&data_col[col_index], src256);
-                }
-
-                for (; w < width_col - pad; ++w) {
-                    int im_row = h_offset + h - pad;
-                    int im_col = w_offset + w - pad;
-                    int col_index = (c * height_col + h) * width_col + w;
-
-                    data_col[col_index] = data_im[im_col + width*(im_row + height*c_im)];
-                }
-            }
-
-            {
-                w = 0;
-                for (h = 0; h < height_col; ++h) {
-                    int im_row = h_offset + h;
-                    int im_col = w_offset + w;
-                    int col_index = (c * height_col + h) * width_col + w;
-                    data_col[col_index] = im2col_get_pixel(data_im, height, width, channels,
-                        im_row, im_col, c_im, pad);
-                }
-            }
-
-            {
-                w = width_col - 1;
-                for (h = 0; h < height_col; ++h) {
-                    int im_row = h_offset + h;
-                    int im_col = w_offset + w;
-                    int col_index = (c * height_col + h) * width_col + w;
-                    data_col[col_index] = im2col_get_pixel(data_im, height, width, channels,
-                        im_row, im_col, c_im, pad);
-                }
-            }
-
-            {
-                h = 0;
-                for (w = 0; w < width_col; ++w) {
-                    int im_row = h_offset + h;
-                    int im_col = w_offset + w;
-                    int col_index = (c * height_col + h) * width_col + w;
-                    data_col[col_index] = im2col_get_pixel(data_im, height, width, channels,
-                        im_row, im_col, c_im, pad);
-                }
-            }
-
-            {
-                h = height_col - 1;
-                for (w = 0; w < width_col; ++w) {
-                    int im_row = h_offset + h;
-                    int im_col = w_offset + w;
-                    int col_index = (c * height_col + h) * width_col + w;
-                    data_col[col_index] = im2col_get_pixel(data_im, height, width, channels,
-                        im_row, im_col, c_im, pad);
-                }
-            }
-        }
-
-    }
-    else {
-        //printf("\n Error: is no non-optimized version \n");
-        im2col_cpu(data_im, channels, height, width, ksize, stride, pad, data_col);
-    }
-}
-
-//From Berkeley Vision's Caffe!
-//https://github.com/BVLC/caffe/blob/master/LICENSE
-void im2col_cpu_custom_bin(float* data_im,
-    int channels, int height, int width,
-    int ksize, int stride, int pad, float* data_col, int bit_align)
-{
-    int c;
-    const int height_col = (height + 2 * pad - ksize) / stride + 1;
-    const int width_col = (width + 2 * pad - ksize) / stride + 1;
-    const int channels_col = channels * ksize * ksize;
-
-    // optimized version
-    if (height_col == height && width_col == width && stride == 1 && pad == 1)
-    {
-        //__m256i all256_sing1 = _mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
-        __m256 float_zero256 = _mm256_set1_ps(0.00);
-
-        int new_ldb = bit_align;
-
-        #pragma omp parallel for
-        for (c = 0; c < channels_col; ++c) {
-            int h, w;
-            int w_offset = c % ksize;
-            int h_offset = (c / ksize) % ksize;
-            int c_im = c / ksize / ksize;
-            for (h = pad; h < height_col - pad; ++h) {
-                for (w = pad; w < width_col - pad - 8; w += 8) {
-                    int im_row = h_offset + h - pad;
-                    int im_col = w_offset + w - pad;
-                    //int col_index = (c * height_col + h) * width_col + w;
-                    int col_index = c * new_ldb + h * width_col + w;
-
-                    //__m256i src256 = _mm256_loadu_si256((__m256i *)(&data_im[im_col + width*(im_row + height*c_im)]));
-                    //__m256i result256 = _mm256_and_si256(src256, all256_sing1); // check sign in 8 x 32-bit floats
-                    //uint16_t mask = _mm256_movemask_ps(_mm256_castsi256_ps(result256)); // (val >= 0) ? 0 : 1
-                    //mask = ~mask;   // inverse mask,  (val >= 0) ? 1 : 0
-
-                    __m256 src256 = _mm256_loadu_ps((float *)(&data_im[im_col + width*(im_row + height*c_im)]));
-                    __m256 result256 = _mm256_cmp_ps(src256, float_zero256, _CMP_GT_OS);
-                    uint16_t mask = _mm256_movemask_ps(result256); // (val > 0) ? 0 : 1
-
-                    uint16_t *dst_ptr = &((unsigned char*)data_col)[col_index / 8];
-                    *dst_ptr |= (mask << (col_index % 8));
-                }
-
-                for (; w < width_col - pad; ++w) {
-                    int im_row = h_offset + h - pad;
-                    int im_col = w_offset + w - pad;
-                    //int col_index = (c * height_col + h) * width_col + w;
-                    int col_index = c * new_ldb + h * width_col + w;
-
-                    //data_col[col_index] = data_im[im_col + width*(im_row + height*c_im)];
-                    float val = data_im[im_col + width*(im_row + height*c_im)];
-                    if (val > 0) set_bit(data_col, col_index);
-                }
-            }
-
-            {
-                w = 0;
-                for (h = 0; h < height_col; ++h) {
-                    int im_row = h_offset + h;
-                    int im_col = w_offset + w;
-                    //int col_index = (c * height_col + h) * width_col + w;
-                    int col_index = c * new_ldb + h * width_col + w;
-
-                    //data_col[col_index] = im2col_get_pixel(data_im, height, width, channels, im_row, im_col, c_im, pad);
-                    float val = im2col_get_pixel(data_im, height, width, channels, im_row, im_col, c_im, pad);
-                    if (val > 0) set_bit(data_col, col_index);
-                }
-            }
-
-            {
-                w = width_col - 1;
-                for (h = 0; h < height_col; ++h) {
-                    int im_row = h_offset + h;
-                    int im_col = w_offset + w;
-                    //int col_index = (c * height_col + h) * width_col + w;
-                    int col_index = c * new_ldb + h * width_col + w;
-
-                    //data_col[col_index] = im2col_get_pixel(data_im, height, width, channels, im_row, im_col, c_im, pad);
-                    float val = im2col_get_pixel(data_im, height, width, channels, im_row, im_col, c_im, pad);
-                    if (val > 0) set_bit(data_col, col_index);
-                }
-            }
-
-            {
-                h = 0;
-                for (w = 0; w < width_col; ++w) {
-                    int im_row = h_offset + h;
-                    int im_col = w_offset + w;
-                    //int col_index = (c * height_col + h) * width_col + w;
-                    int col_index = c * new_ldb + h * width_col + w;
-
-                    //data_col[col_index] = im2col_get_pixel(data_im, height, width, channels, im_row, im_col, c_im, pad);
-                    float val = im2col_get_pixel(data_im, height, width, channels, im_row, im_col, c_im, pad);
-                    if (val > 0) set_bit(data_col, col_index);
-                }
-            }
-
-            {
-                h = height_col - 1;
-                for (w = 0; w < width_col; ++w) {
-                    int im_row = h_offset + h;
-                    int im_col = w_offset + w;
-                    //int col_index = (c * height_col + h) * width_col + w;
-                    int col_index = c * new_ldb + h * width_col + w;
-
-                    //data_col[col_index] = im2col_get_pixel(data_im, height, width, channels, im_row, im_col, c_im, pad);
-                    float val = im2col_get_pixel(data_im, height, width, channels, im_row, im_col, c_im, pad);
-                    if (val > 0) set_bit(data_col, col_index);
-                }
-            }
-        }
-
-    }
-    else {
-        printf("\n Error: is no non-optimized version \n");
-        //im2col_cpu(data_im, channels, height, width, ksize, stride, pad, data_col); // must be aligned for transpose after float_to_bin
-        // float_to_bit(b, t_input, src_size);
-        // transpose_bin(t_input, *t_bit_input, k, n, bit_align, new_ldb, 8);
-    }
-}
-
-void activate_array_cpu_custom(float *x, const int n, const ACTIVATION a)
-{
-    int i = 0;
-    if (a == LINEAR)
-    {
-    }
-    else if (a == LEAKY)
-    {
-        {
-            __m256i all256_sing1 = _mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
-            __m256 all256_01 = _mm256_set1_ps(0.1F);
-
-            for (i = 0; i < n - 8; i += 8) {
-                //x[i] = (x[i]>0) ? x[i] : .1*x[i];
-
-                __m256 src256 = _mm256_loadu_ps(&x[i]);
-                __m256 mult256 = _mm256_mul_ps((src256), all256_01); // mult * 0.1
-
-                __m256i sign256 = _mm256_and_si256(_mm256_castps_si256(src256), all256_sing1); // check sign in 8 x 32-bit floats
-
-                __m256 result256 = _mm256_blendv_ps(src256, mult256, _mm256_castsi256_ps(sign256)); // (sign>0) ? src : mult;
-                _mm256_storeu_ps(&x[i], result256);
-            }
-        }
-
-        for (; i < n; ++i) {
-            x[i] = (x[i]>0) ? x[i] : .1*x[i];
-        }
-    }
-    else {
-        for (i = 0; i < n; ++i) {
-            x[i] = activate(x[i], a);
-        }
-    }
-}
-
-
-void forward_maxpool_layer_avx(float *src, float *dst, int *indexes, int size, int w, int h, int out_w, int out_h, int c,
-    int pad, int stride, int batch)
-{
-
-    const int w_offset = -pad / 2;
-    const int h_offset = -pad / 2;
-    int b, k;
-
-    for (b = 0; b < batch; ++b) {
-        #pragma omp parallel for
-        for (k = 0; k < c; ++k) {
-            int i, j, m, n;
-            for (i = 0; i < out_h; ++i) {
-                //for (j = 0; j < out_w; ++j) {
-                j = 0;
-
-                if (stride == 1) {
-                    for (j = 0; j < out_w - 8 - (size - 1); j += 8) {
-                        int out_index = j + out_w*(i + out_h*(k + c*b));
-                        __m256 max256 = _mm256_set1_ps(-FLT_MAX);
-                        for (n = 0; n < size; ++n) {
-                            for (m = 0; m < size; ++m) {
-                                int cur_h = h_offset + i*stride + n;
-                                int cur_w = w_offset + j*stride + m;
-                                int index = cur_w + w*(cur_h + h*(k + b*c));
-                                int valid = (cur_h >= 0 && cur_h < h &&
-                                    cur_w >= 0 && cur_w < w);
-                                if (!valid) continue;
-
-                                __m256 src256 = _mm256_loadu_ps(&src[index]);
-                                max256 = _mm256_max_ps(src256, max256);
-                            }
-                        }
-                        _mm256_storeu_ps(&dst[out_index], max256);
-
-                    }
-                }
-                else if (size == 2 && stride == 2) {
-                    for (j = 0; j < out_w - 4; j += 4) {
-                        int out_index = j + out_w*(i + out_h*(k + c*b));
-                        float max = -FLT_MAX;
-                        int max_i = -1;
-                        __m128 max128 = _mm_set1_ps(-FLT_MAX);
-
-                        for (n = 0; n < size; ++n) {
-                            //for (m = 0; m < size; ++m)
-                            m = 0;
-                            {
-                                int cur_h = h_offset + i*stride + n;
-                                int cur_w = w_offset + j*stride + m;
-                                int index = cur_w + w*(cur_h + h*(k + b*c));
-                                int valid = (cur_h >= 0 && cur_h < h &&
-                                    cur_w >= 0 && cur_w < w);
-                                if (!valid) continue;
-
-                                __m256 src256 = _mm256_loadu_ps(&src[index]);
-                                __m256 src256_2 = _mm256_permute_ps(src256, (1 << 0) | (3 << 4));
-                                __m256 max256 = _mm256_max_ps(src256, src256_2);
-
-                                __m128 src128_0 = _mm256_extractf128_ps(max256, 0);
-                                __m128 src128_1 = _mm256_extractf128_ps(max256, 1);
-                                __m128 src128 = _mm_shuffle_ps(src128_0, src128_1, (2 << 2) | (2 << 6));
-
-                                max128 = _mm_max_ps(src128, max128);
-                            }
-                        }
-                        _mm_storeu_ps(&dst[out_index], max128);
-                    }
-                }
-
-                for (; j < out_w; ++j) {
-                    int out_index = j + out_w*(i + out_h*(k + c*b));
-                    float max = -FLT_MAX;
-                    int max_i = -1;
-                    for (n = 0; n < size; ++n) {
-                        for (m = 0; m < size; ++m) {
-                            int cur_h = h_offset + i*stride + n;
-                            int cur_w = w_offset + j*stride + m;
-                            int index = cur_w + w*(cur_h + h*(k + b*c));
-                            int valid = (cur_h >= 0 && cur_h < h &&
-                                cur_w >= 0 && cur_w < w);
-                            float val = (valid != 0) ? src[index] : -FLT_MAX;
-                            max_i = (val > max) ? index : max_i;
-                            max = (val > max) ? val : max;
-                        }
-                    }
-                    dst[out_index] = max;
-                    indexes[out_index] = max_i;
-                }
-            }
-        }
-    }
-}
-
-
-// http://graphics.stanford.edu/~seander/bithacks.html
-// https://stackoverflow.com/questions/17354971/fast-counting-the-number-of-set-bits-in-m128i-register
-// https://arxiv.org/pdf/1611.07612.pdf
-
-static inline int popcnt128(__m128i n) {
-    const __m128i n_hi = _mm_unpackhi_epi64(n, n);
-#ifdef _MSC_VER
-    return __popcnt64(_mm_cvtsi128_si64(n)) + __popcnt64(_mm_cvtsi128_si64(n_hi));
-#else
-    return __popcntq(_mm_cvtsi128_si64(n)) + __popcntq(_mm_cvtsi128_si64(n_hi));
-#endif
-}
-
-static inline int popcnt256(__m256i n) {
-    return popcnt128(_mm256_extractf128_si256(n, 0)) + popcnt128(_mm256_extractf128_si256(n, 1));
-}
-
-static inline __m256i count256(__m256i v) {
-    __m256i lookup =
-        _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2,
-            2, 3, 2, 3, 3, 4, 0, 1, 1, 2, 1, 2, 2, 3,
-            1, 2, 2, 3, 2, 3, 3, 4);
-
-    __m256i low_mask = _mm256_set1_epi8(0x0f);
-
-    __m256i lo = _mm256_and_si256(v, low_mask);
-    __m256i hi = _mm256_and_si256(_mm256_srli_epi32(v, 4), low_mask);
-    __m256i popcnt1 = _mm256_shuffle_epi8(lookup, lo);
-    __m256i popcnt2 = _mm256_shuffle_epi8(lookup, hi);
-    __m256i total = _mm256_add_epi8(popcnt1, popcnt2);
-
-    return _mm256_sad_epu8(total, _mm256_setzero_si256());
-}
-
-static inline int popcnt256_custom(__m256i n) {
-    __m256i val = count256(n);
-
-    //return val.m256i_i64[0] +
-    //val.m256i_i64[1] +
-    //val.m256i_i64[2] +
-    //val.m256i_i64[3];
-    return _mm256_extract_epi64(val, 0)
-        + _mm256_extract_epi64(val, 1)
-        + _mm256_extract_epi64(val, 2)
-        + _mm256_extract_epi64(val, 3);
-}
-
-// 5x times faster than gemm()-float32
-// further optimizations: do mean-mult only for the last layer
-void gemm_nn_custom_bin_mean_transposed(int M, int N, int K, float ALPHA_UNUSED,
-    unsigned char *A, int lda,
-    unsigned char *B, int ldb,
-    float *C, int ldc, float *mean_arr)
-{
-
-#if defined(_OPENMP)
-    static int max_num_threads = 0;
-    if (max_num_threads == 0) {
-        max_num_threads = omp_get_max_threads();
-        //omp_set_num_threads(max_num_threads / 2);
-    }
-#endif
-
-    int i;
-    #pragma omp parallel for
-    for (i = 0; i < M; ++i)
-    {   // l.n - filters [16 - 55 - 1024]
-        float mean_val = mean_arr[i];
-        int j, k;
-        __m256i all_1 = _mm256_set1_epi8(255);
-
-        for (j = 0; j < N; ++j) { // out_h*out_w - one channel output size [169 - 173056]
-            int count = 0;
-            const int bit_step = 256;
-            __m256i count_sum = _mm256_set1_epi8(0);
-
-            for (k = 0; k < K; k += bit_step) {   // l.size*l.size*l.c - one filter size [27 - 9216]
-                __m256i a_bit256 = _mm256_loadu_si256((__m256i *)(A + (i*lda + k) / 8));
-                __m256i b_bit256 = _mm256_loadu_si256((__m256i *)(B + (j*ldb + k) / 8));
-                __m256i xor256 = _mm256_xor_si256(a_bit256, b_bit256);  // xnor = not(xor(a,b))
-                __m256i c_bit256 = _mm256_andnot_si256(xor256, all_1);  // can be optimized - we can do other NOT for wegihts once and do not do this NOT
-
-                count_sum = _mm256_add_epi64(count256(c_bit256), count_sum);    //  Mula�s algorithm
-
-                                                                                //count += popcnt256(c_bit256);
-
-                                                                                //binary_int64_printf(c_bit64);
-                                                                                //printf(", count = %d \n\n", tmp_count);
-            }
-
-            // count of 1 bits
-            //count = count_sum.m256i_i64[0] +
-            //    count_sum.m256i_i64[1] +
-            //    count_sum.m256i_i64[2] +
-            //   count_sum.m256i_i64[3];
-            count = _mm256_extract_epi64(count_sum, 0)
-                + _mm256_extract_epi64(count_sum, 1)
-                + _mm256_extract_epi64(count_sum, 2)
-                + _mm256_extract_epi64(count_sum, 3);
-
-            int f1 = (K % bit_step == 0) ? 0 : (bit_step - (K % bit_step));
-            count = count - f1;    // remove extra bits (from empty space for align only)
-
-            C[i*ldc + j] = (2 * count - K) * mean_val;
-        }
-    }
-}
-
-
-
-void float_to_bit(float *src, unsigned char *dst, size_t size)
-{
-    size_t dst_size = size / 8 + 1;
-    memset(dst, 0, dst_size);
-
-    size_t i;
-    //__m256i all256_sing1 = _mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
-    __m256 float_zero256 = _mm256_set1_ps(0.0);
-
-    for (i = 0; i < size; i += 8)
-    {
-        //__m256i src256 = _mm256_loadu_si256((__m256i *)(&src[i]));
-        //__m256i result256 = _mm256_and_si256(src256, all256_sing1); // check sign in 8 x 32-bit floats
-        //uint32_t mask = _mm256_movemask_ps(_mm256_castsi256_ps(result256)); // (val >= 0) ? 0 : 1
-        ////mask = ~mask;   // inverse mask,  (val >= 0) ? 1 : 0
-
-        __m256 src256 = _mm256_loadu_ps((float *)(&src[i]));
-        __m256 result256 = _mm256_cmp_ps(src256, float_zero256, _CMP_GT_OS);
-        uint32_t mask = _mm256_movemask_ps(result256); // (val > 0) ? 0 : 1
-
-        dst[i / 8] = mask;
-    }
-}
-
-#else // AVX
-
 void gemm_nn(int M, int N, int K, float ALPHA,
     float *A, int lda,
     float *B, int ldb,
@@ -1259,7 +598,6 @@ void float_to_bit(float *src, unsigned char *dst, size_t size)
     }
     free(byte_arr);
 }
-#endif    // __x86_64
 
 /*
 void gemm_nn(int M, int N, int K, float ALPHA,
@@ -1718,9 +1056,6 @@ char **get_labels(char *filename)
 // network.c
 float *get_network_output(network net)
 {
-#ifdef GPU
-    if (gpu_index >= 0) return get_network_output_gpu(net);
-#endif
     int i;
     for (i = net.n - 1; i > 0; --i) if (net.layers[i].type != COST) break;
     return net.layers[i].output;
@@ -1741,10 +1076,6 @@ network make_network(int n)
     net.n = n;
     net.layers = calloc(net.n, sizeof(layer));
     net.seen = calloc(1, sizeof(uint64_t));
-#ifdef GPU
-    net.input_gpu = calloc(1, sizeof(float *));
-    net.truth_gpu = calloc(1, sizeof(float *));
-#endif
     return net;
 }
 
@@ -1760,89 +1091,8 @@ void free_network(network net)
     free(net.steps);
     free(net.seen);
 
-#ifdef GPU
-    if (gpu_index >= 0) cuda_free(net.workspace);
-    else free(net.workspace);
-    if (net.input_state_gpu) cuda_free(net.input_state_gpu);
-    if (*net.input_gpu) cuda_free(*net.input_gpu);
-    if (*net.truth_gpu) cuda_free(*net.truth_gpu);
-    if (net.input_gpu) free(net.input_gpu);
-    if (net.truth_gpu) free(net.truth_gpu);
-
-    //if (*net.input16_gpu) cuda_free(*net.input16_gpu);
-    //if (*net.output16_gpu) cuda_free(*net.output16_gpu);
-    //if (net.input16_gpu) free(net.input16_gpu);
-    //if (net.output16_gpu) free(net.output16_gpu);
-    //if (net.max_input16_size) free(net.max_input16_size);
-    //if (net.max_output16_size) free(net.max_output16_size);
-#else
     free(net.workspace);
-#endif
-}
-
-
-// network.c
-#ifdef GPU
-#ifdef CUDNN
-void cudnn_convolutional_setup(layer *l)
-{
-#if(CUDNN_MAJOR >= 7)
-    cudnnSetConvolutionMathType(l->convDesc, CUDNN_TENSOR_OP_MATH);
-#if((CUDNN_MAJOR*10 + CUDNN_MINOR) >= 72)   // cuDNN >= 7.2
-    cudnnSetConvolutionMathType(l->convDesc, CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION);
-#endif  //(CUDNN_MAJOR >= 7.2)
-#endif  //(CUDNN_MAJOR >= 7)
-
-    if (l->quantized)
-    {
-        cudnnDataType_t cudnn_data_type = CUDNN_DATA_INT8x4;
-        cudnnTensorFormat_t tensor_format = CUDNN_TENSOR_NCHW_VECT_C;
-        cudnnTensorFormat_t dst_tensor_format = CUDNN_TENSOR_NCHW;
-
-#if((CUDNN_MAJOR*10 + CUDNN_MINOR) >= 72)
-        //if (l->c % 32 == 0) cudnn_data_type = CUDNN_DATA_INT8x32;   // Tensor Cores for INT8
-#endif  //(CUDNN_MAJOR >= 7.2)
-
-        cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW_VECT_C, cudnn_data_type, l->batch, l->c, l->h, l->w);
-        cudnnSetFilter4dDescriptor(l->weightDesc, cudnn_data_type, CUDNN_TENSOR_NCHW_VECT_C, l->n, l->c, l->size, l->size);
-        cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);
-        cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_INT32);    // cudnn 7
-
-        l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; // only supported for DP4A INT8x4
-
-                                                                       // BIAS float
-        cudnnSetTensor4dDescriptor(l->biasTensorDesc, dst_tensor_format, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1);
-
-        // https://en.wikipedia.org/wiki/Activation_function
-        // CUDNN_ACTIVATION_IDENTITY
-        cudnnSetActivationDescriptor(l->activationDesc, CUDNN_ACTIVATION_IDENTITY, CUDNN_NOT_PROPAGATE_NAN, 0);
-        //cudnnSetActivationDescriptor(activationDesc, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0.1);    // RELU or ELU can't replace LEAKY_RELU
-    }
-    else {
-        cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w);
-        cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);
-        cudnnSetFilter4dDescriptor(l->weightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size);
-#if(CUDNN_MAJOR >= 6)
-        cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT);    // cudnn 6.0
-#else
-        cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION);    // cudnn 5.1
-#endif
-        cudnnGetConvolutionForwardAlgorithm(cudnn_handle(),
-            l->srcTensorDesc,
-            l->weightDesc,
-            l->convDesc,
-            l->dstTensorDesc,
-            CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
-            0,
-            &l->fw_algo);
-
-        //l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; // un-comment to use Tensor Cores for cuDNN >= 7.2
-    }
 }
-#endif
-#endif
-
-
 
 // network.c
 void set_batch_network(network *net, int b)
@@ -1852,11 +1102,6 @@ void set_batch_network(network *net, int b)
     for (i = 0; i < net->n; ++i) {
         layer l = net->layers[i];
         l.batch = b;
-#ifdef CUDNN
-        if (l.type == CONVOLUTIONAL) {
-            cudnn_convolutional_setup(net->layers + i);
-        }
-#endif
     }
 }
 
@@ -1868,9 +1113,6 @@ void free_layer(layer l)
 {
     if (l.type == DROPOUT) {
         if (l.rand)           free(l.rand);
-#ifdef GPU
-        if (l.rand_gpu)             cuda_free(l.rand_gpu);
-#endif
         return;
     }
     if (l.cweights)           free(l.cweights);
@@ -1917,55 +1159,6 @@ void free_layer(layer l)
     if (l.r_cpu)              free(l.r_cpu);
     if (l.h_cpu)              free(l.h_cpu);
     if (l.binary_input)       free(l.binary_input);
-
-#ifdef GPU
-    if (l.indexes_gpu)           cuda_free((float *)l.indexes_gpu);
-
-    if (l.z_gpu)                   cuda_free(l.z_gpu);
-    if (l.r_gpu)                   cuda_free(l.r_gpu);
-    if (l.h_gpu)                   cuda_free(l.h_gpu);
-    if (l.m_gpu)                   cuda_free(l.m_gpu);
-    if (l.v_gpu)                   cuda_free(l.v_gpu);
-    if (l.prev_state_gpu)          cuda_free(l.prev_state_gpu);
-    if (l.forgot_state_gpu)        cuda_free(l.forgot_state_gpu);
-    if (l.forgot_delta_gpu)        cuda_free(l.forgot_delta_gpu);
-    if (l.state_gpu)               cuda_free(l.state_gpu);
-    if (l.state_delta_gpu)         cuda_free(l.state_delta_gpu);
-    if (l.gate_gpu)                cuda_free(l.gate_gpu);
-    if (l.gate_delta_gpu)          cuda_free(l.gate_delta_gpu);
-    if (l.save_gpu)                cuda_free(l.save_gpu);
-    if (l.save_delta_gpu)          cuda_free(l.save_delta_gpu);
-    if (l.concat_gpu)              cuda_free(l.concat_gpu);
-    if (l.concat_delta_gpu)        cuda_free(l.concat_delta_gpu);
-    if (l.binary_input_gpu)        cuda_free(l.binary_input_gpu);
-    if (l.binary_weights_gpu)      cuda_free(l.binary_weights_gpu);
-    if (l.mean_gpu)                cuda_free(l.mean_gpu);
-    if (l.variance_gpu)            cuda_free(l.variance_gpu);
-    if (l.rolling_mean_gpu)        cuda_free(l.rolling_mean_gpu);
-    if (l.rolling_variance_gpu)    cuda_free(l.rolling_variance_gpu);
-    if (l.variance_delta_gpu)      cuda_free(l.variance_delta_gpu);
-    if (l.mean_delta_gpu)          cuda_free(l.mean_delta_gpu);
-    if (l.x_gpu)                   cuda_free(l.x_gpu);
-    if (l.x_norm_gpu)              cuda_free(l.x_norm_gpu);
-
-    if (l.align_bit_weights_gpu)   cuda_free(l.align_bit_weights_gpu);
-    if (l.mean_arr_gpu)            cuda_free(l.mean_arr_gpu);
-    if (l.align_workspace_gpu)     cuda_free(l.align_workspace_gpu);
-    if (l.transposed_align_workspace_gpu) cuda_free(l.transposed_align_workspace_gpu);
-
-    if (l.weights_gpu)             cuda_free(l.weights_gpu);
-    //if (l.weight_updates_gpu)      cuda_free(l.weight_updates_gpu);
-    if (l.biases_gpu)              cuda_free(l.biases_gpu);
-    //if (l.bias_updates_gpu)        cuda_free(l.bias_updates_gpu);
-    if (l.scales_gpu)              cuda_free(l.scales_gpu);
-    //if (l.scale_updates_gpu)       cuda_free(l.scale_updates_gpu);
-    if (l.output_gpu)              cuda_free(l.output_gpu);
-    if (l.output_gpu_int8)         cuda_free(l.output_gpu_int8);
-    if (l.delta_gpu)               cuda_free(l.delta_gpu);
-    if (l.rand_gpu)                cuda_free(l.rand_gpu);
-    if (l.squared_gpu)             cuda_free(l.squared_gpu);
-    if (l.norms_gpu)               cuda_free(l.norms_gpu);
-#endif
 }
 
 
@@ -1988,21 +1181,6 @@ softmax_layer make_softmax_layer(int batch, int inputs, int groups)
     // commented only for this custom version of Yolo v2
     //l.forward = forward_softmax_layer;
     //l.backward = backward_softmax_layer;
-#ifdef GPU
-    // commented only for this custom version of Yolo v2
-    //l.forward_gpu = forward_softmax_layer_gpu;
-    //l.backward_gpu = backward_softmax_layer_gpu;
-
-    l.output_gpu = cuda_make_array(l.output, inputs*batch);
-    cudaError_t status;
-    status = cudaMalloc((void **)&(l.output_gpu_int8), sizeof(int8_t)*inputs*batch);
-    //l.delta_gpu = cuda_make_array(l.delta, inputs*batch);
-#endif
-
-#ifdef OPENCL
-    l.output_ocl = ocl_make_array(l.output, inputs*batch);
-#endif
-
     return l;
 }
 
@@ -2033,11 +1211,6 @@ layer make_upsample_layer(int batch, int w, int h, int c, int stride)
     l.output = calloc(l.outputs*batch, sizeof(float));;
 
     //l.forward = forward_upsample_layer;
-#ifdef GPU
-    //l.forward_gpu = forward_upsample_layer_gpu;
-
-    l.output_gpu = cuda_make_array(l.output, l.outputs*batch);
-#endif
     if (l.reverse) fprintf(stderr, "downsample         %2dx  %4d x%4d x%4d   ->  %4d x%4d x%4d\n", stride, w, h, c, l.out_w, l.out_h, l.out_c);
     else fprintf(stderr, "upsample           %2dx  %4d x%4d x%4d   ->  %4d x%4d x%4d\n", stride, w, h, c, l.out_w, l.out_h, l.out_c);
     return l;
@@ -2063,10 +1236,7 @@ layer make_shortcut_layer(int batch, int index, int w, int h, int c, int w2, int
 
     l.index = index;
 
-    l.output = calloc(l.outputs*batch, sizeof(float));;
-#ifdef GPU
-    l.output_gpu = cuda_make_array(l.output, l.outputs*batch);
-#endif
+    l.output = calloc(l.outputs*batch, sizeof(float));
     return l;
 }
 
@@ -2104,20 +1274,6 @@ layer make_reorg_layer(int batch, int w, int h, int c, int stride, int reverse)
     // commented only for this custom version of Yolo v2
     //l.forward = forward_reorg_layer;
     //l.backward = backward_reorg_layer;
-#ifdef GPU
-    // commented only for this custom version of Yolo v2
-    //l.forward_gpu = forward_reorg_layer_gpu;
-    //l.backward_gpu = backward_reorg_layer_gpu;
-
-    l.output_gpu = cuda_make_array(l.output, output_size);
-    cudaError_t status;
-    status = cudaMalloc((void **)&(l.output_gpu_int8), sizeof(int8_t)*output_size);
-    //l.delta_gpu = cuda_make_array(l.delta, output_size);
-#endif
-
-#ifdef OPENCL
-    l.output_ocl = ocl_make_array(l.output, output_size);
-#endif
     return l;
 }
 
@@ -2149,20 +1305,6 @@ route_layer make_route_layer(int batch, int n, int *input_layers, int *input_siz
     // commented only for this custom version of Yolo v2
     //l.forward = forward_route_layer;
     //l.backward = backward_route_layer;
-#ifdef GPU
-    // commented only for this custom version of Yolo v2
-    //l.forward_gpu = forward_route_layer_gpu;
-    //l.backward_gpu = backward_route_layer_gpu;
-
-    //l.delta_gpu = cuda_make_array(l.delta, outputs*batch);
-    l.output_gpu = cuda_make_array(l.output, outputs*batch);
-    cudaError_t status;
-    status = cudaMalloc((void **)&(l.output_gpu_int8), sizeof(int8_t)*outputs*batch);
-#endif
-
-#ifdef OPENCL
-    l.output_ocl = ocl_make_array(l.output, outputs*batch);
-#endif
     return l;
 }
 
@@ -2204,10 +1346,6 @@ layer make_yolo_layer(int batch, int w, int h, int n, int total, int *mask, int
         l.biases[i] = .5;
     }
 
-#ifdef GPU
-    l.output_gpu = cuda_make_array(l.output, batch*l.outputs);
-#endif
-
     fprintf(stderr, "yolo\n");
     srand(0);
 
@@ -2244,20 +1382,6 @@ region_layer make_region_layer(int batch, int w, int h, int n, int classes, int
     // commented only for this custom version of Yolo v2
     //l.forward = forward_region_layer;
     //l.backward = backward_region_layer;
-#ifdef GPU
-    // commented only for this custom version of Yolo v2
-    //l.forward_gpu = forward_region_layer_gpu;
-    //l.backward_gpu = backward_region_layer_gpu;
-    l.output_gpu = cuda_make_array(l.output, batch*l.outputs);
-    cudaError_t status;
-    status = cudaMalloc((void **)&(l.output_gpu_int8), sizeof(int8_t)*l.outputs*batch);
-    //l.delta_gpu = cuda_make_array(l.delta, batch*l.outputs);
-#endif
-
-#ifdef OPENCL
-    l.output_ocl = ocl_make_array(l.output, batch*l.outputs);
-#endif
-
     fprintf(stderr, "detection\n");
     srand(0);
 
@@ -2292,40 +1416,6 @@ maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int s
     // commented only for this custom version of Yolo v2
     //l.forward = forward_maxpool_layer;
     //l.backward = backward_maxpool_layer;
-#ifdef GPU
-    // commented only for this custom version of Yolo v2
-    //l.forward_gpu = forward_maxpool_layer_gpu;
-    //l.backward_gpu = backward_maxpool_layer_gpu;
-    l.indexes_gpu = cuda_make_int_array(output_size);
-    l.output_gpu = cuda_make_array(l.output, output_size);
-    cudaError_t status;
-    status = cudaMalloc((void **)&(l.output_gpu_int8), sizeof(int8_t)*output_size);
-    //l.delta_gpu = cuda_make_array(l.delta, output_size);
-
-    cudnnStatus_t maxpool_status;
-    maxpool_status = cudnnCreatePoolingDescriptor(&l.poolingDesc);
-
-    maxpool_status = cudnnSetPooling2dDescriptor(
-        l.poolingDesc,
-        CUDNN_POOLING_MAX,
-        CUDNN_PROPAGATE_NAN,    // CUDNN_PROPAGATE_NAN, CUDNN_NOT_PROPAGATE_NAN
-        l.size,
-        l.size,
-        0, //l.pad,
-        0, //l.pad,
-        l.stride,
-        l.stride);
-
-    cudnnCreateTensorDescriptor(&l.srcTensorDesc);
-    cudnnCreateTensorDescriptor(&l.dstTensorDesc);
-    cudnnSetTensor4dDescriptor(l.srcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.c, l.h, l.w);
-    cudnnSetTensor4dDescriptor(l.dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w);
-#endif
-
-#ifdef OPENCL
-    l.indexes_ocl = ocl_make_int_array(output_size);
-    l.output_ocl = ocl_make_array(l.output, output_size);
-#endif
     fprintf(stderr, "max          %d x %d / %d  %4d x%4d x%4d   ->  %4d x%4d x%4d\n", size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c);
     return l;
 }
@@ -2335,38 +1425,6 @@ maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int s
 
 // convolutional_layer.c
 size_t get_workspace_size(layer l) {
-#ifdef CUDNN
-    if (gpu_index >= 0) {
-        size_t most = 0;
-        size_t s = 0;
-        cudnnGetConvolutionForwardWorkspaceSize(cudnn_handle(),
-            l.srcTensorDesc,
-            l.weightDesc,
-            l.convDesc,
-            l.dstTensorDesc,
-            l.fw_algo,
-            &s);
-        /*
-        if (s > most) most = s;
-        cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnn_handle(),
-        l.srcTensorDesc,
-        l.ddstTensorDesc,
-        l.convDesc,
-        l.dweightDesc,
-        l.bf_algo,
-        &s);
-        if (s > most) most = s;
-        cudnnGetConvolutionBackwardDataWorkspaceSize(cudnn_handle(),
-        l.weightDesc,
-        l.ddstTensorDesc,
-        l.convDesc,
-        l.dsrcTensorDesc,
-        l.bd_algo,
-        &s);*/
-        if (s > most) most = s;
-        return most;
-    }
-#endif
     if (l.xnor) return (size_t)l.bit_align*l.size*l.size*l.c * sizeof(float);
     return (size_t)l.out_h*l.out_w*l.size*l.size*l.c * sizeof(float);
 }
@@ -2468,86 +1526,6 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int
         l.v = calloc(c*n*size*size, sizeof(float));
     }
 
-#ifdef GPU
-    // commented only for this custom version of Yolo v2
-    //l.forward_gpu = forward_convolutional_layer_gpu;
-    //l.backward_gpu = backward_convolutional_layer_gpu;
-    //l.update_gpu = update_convolutional_layer_gpu;
-
-    if (gpu_index >= 0) {
-        //if (adam) {
-        //    l.m_gpu = cuda_make_array(l.m, c*n*size*size);
-        //    l.v_gpu = cuda_make_array(l.v, c*n*size*size);
-        //}
-
-        l.weights_gpu = cuda_make_array(l.weights, c*n*size*size);
-        //l.weight_updates_gpu = cuda_make_array(l.weight_updates, c*n*size*size);
-
-        l.biases_gpu = cuda_make_array(l.biases, n);
-        //l.bias_updates_gpu = cuda_make_array(l.bias_updates, n);
-
-        //l.delta_gpu = cuda_make_array(l.delta, l.batch*out_h*out_w*n);
-        l.output_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
-
-        cudaError_t status;
-        status = cudaMalloc((void **)&(l.output_gpu_int8), sizeof(int8_t)*l.batch*out_h*out_w*n);
-
-        //if (binary) {
-        //    l.binary_weights_gpu = cuda_make_array(l.weights, c*n*size*size);
-        //}
-        if (xnor) {
-            l.binary_weights_gpu = cuda_make_array(l.weights, c*n*size*size);
-            l.binary_input_gpu = cuda_make_array(0, l.inputs*l.batch);
-        }
-
-        if (batch_normalize) {
-            //l.mean_gpu = cuda_make_array(l.mean, n);
-            //l.variance_gpu = cuda_make_array(l.variance, n);
-
-            l.rolling_mean_gpu = cuda_make_array(l.mean, n);
-            l.rolling_variance_gpu = cuda_make_array(l.variance, n);
-
-            //l.mean_delta_gpu = cuda_make_array(l.mean, n);
-            //l.variance_delta_gpu = cuda_make_array(l.variance, n);
-
-            l.scales_gpu = cuda_make_array(l.scales, n);
-            //l.scale_updates_gpu = cuda_make_array(l.scale_updates, n);
-
-            l.x_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
-            //l.x_norm_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
-        }
-#ifdef CUDNN
-        cudnnCreateTensorDescriptor(&l.biasTensorDesc);
-        cudnnCreateActivationDescriptor(&l.activationDesc);
-        cudnnCreateTensorDescriptor(&l.srcTensorDesc);
-        cudnnCreateTensorDescriptor(&l.dstTensorDesc);
-        cudnnCreateFilterDescriptor(&l.weightDesc);
-        //cudnnCreateTensorDescriptor(&l.dsrcTensorDesc);
-        //cudnnCreateTensorDescriptor(&l.ddstTensorDesc);
-        //cudnnCreateFilterDescriptor(&l.dweightDesc);
-        cudnnCreateConvolutionDescriptor(&l.convDesc);
-        cudnn_convolutional_setup(&l);
-#endif
-    }
-#endif
-
-#ifdef OPENCL
-    //if (gpu_index >= 0) {
-
-    l.weights_ocl = ocl_make_array(l.weights, c*n*size*size);
-    l.biases_ocl = ocl_make_array(l.biases, n);
-    l.output_ocl = ocl_make_array(l.output, l.batch*out_h*out_w*n);
-
-    if (batch_normalize) {
-        l.rolling_mean_ocl = ocl_make_array(l.rolling_mean, n);    // l.mean
-        l.rolling_variance_ocl = ocl_make_array(l.rolling_variance, n);    // l.variance
-        l.scales_ocl = ocl_make_array(l.scales, n);
-
-        l.x_ocl = ocl_make_array(l.output, l.batch*out_h*out_w*n);
-    }
-    //}
-#endif
-
     l.workspace_size = get_workspace_size(l);
     l.activation = activation;
 
@@ -3117,35 +2095,11 @@ void load_convolutional_weights_cpu(layer l, FILE *fp)
         fread(l.rolling_variance, sizeof(float), l.n, fp);
     }
     fread(l.weights, sizeof(float), num, fp);
-    /*    if (l.adam) {
-    fread(l.m, sizeof(float), num, fp);
-    fread(l.v, sizeof(float), num, fp);
-    }
-    if (l.flipped) {
-    transpose_matrix(l.weights, l.c*l.size*l.size, l.n);
-    }*/
-    //if (l.binary) binarize_weights(l.weights, l.n, l.c*l.size*l.size, l.weights);
-#ifdef GPU
-    if (gpu_index >= 0) {
-        push_convolutional_layer(l);
-    }
-#endif
-
-#ifdef OPENCL
-    //if (gpu_index >= 0) {
-    ocl_push_convolutional_layer(l);
-    //}
-#endif
 }
 
 // parser.c
 void load_weights_upto_cpu(network *net, char *filename, int cutoff)
 {
-#ifdef GPU
-    if (net->gpu_index >= 0) {
-        cuda_set_device(net->gpu_index);
-    }
-#endif
     fprintf(stderr, "Loading weights from %s...", filename);
     fflush(stdout);
     FILE *fp = fopen(filename, "rb");
@@ -3704,26 +2658,7 @@ network parse_network_cfg(char *filename, int batch, int quantized)
     net.output = get_network_output(net);
     if (workspace_size) {
         //printf("%ld\n", workspace_size);
-#ifdef GPU
-        if (gpu_index >= 0) {
-            net.workspace = cuda_make_array(0, (workspace_size - 1) / sizeof(float) + 1);
-            int size = net.layers[0].inputs * net.batch;    //get_network_input_size(net) * net.batch;
-            net.input_state_gpu = cuda_make_array(0, size);
-        }
-        else {
-            net.workspace = calloc(1, workspace_size);
-        }
-#else    // GPU
         net.workspace = calloc(1, workspace_size);
-#endif    // GPU
-
-#ifdef OPENCL
-        //if (gpu_index >= 0) {
-        net.workspace_ocl = ocl_make_array(0, workspace_size / sizeof(float));
-        //net.workspace_ocl = ocl_make_array(0, (workspace_size - 1) / sizeof(float) + 1);
-        //net.workspace_ocl = ocl_make_array(NULL, 1024*1024*1024);
-        //}
-#endif    // OPENCL
     }
     return net;
 }
@@ -4281,18 +3216,6 @@ void validate_detector_map(char *datacfg, char *cfgfile, char *weightfile, float
             //char *id = basecfg(path);
             float *X = val_resized[t].data;
             //network_predict(net, X);
-#ifdef GPU
-            if (quantized) {
-                network_predict_gpu_cudnn_quantized(net, X);    // quantized
-                //nms = 0.2;
-            }
-            else {
-                network_predict_gpu_cudnn(net, X);
-            }
-#else    // GPU
-#ifdef OPENCL
-            network_predict_opencl(net, X);
-#else    // OPENCL
             if (quantized) {
                 network_predict_quantized(net, X);    // quantized
                 //nms = 0.2;
@@ -4300,8 +3223,6 @@ void validate_detector_map(char *datacfg, char *cfgfile, char *weightfile, float
             else {
                 network_predict_cpu(net, X);
             }
-#endif    // OPENCL
-#endif    // GPU
 
             int nboxes = 0;
             int letterbox = (args.type == LETTERBOX_DATA);
@@ -4566,17 +3487,6 @@ void validate_calibrate_valid(char *datacfg, char *cfgfile, char *weightfile, in
     yolov2_fuse_conv_batchnorm(net);
     srand(time(0));
 
-#ifdef GPU
-    size_t workspace_size = 0;
-    for (j = 0; j < net.n; ++j) {
-        layer l = net.layers[j];
-        size_t cur_workspace_size = (size_t)l.out_h*l.out_w*l.size*l.size*l.c * sizeof(float);
-        if (cur_workspace_size > workspace_size) workspace_size = cur_workspace_size;
-    }
-    cudaFree(net.workspace);
-    net.workspace = calloc(1, workspace_size);
-#endif // GPU
-
     list *plist = get_paths(valid_images);
     char **paths = (char **)list_to_array(plist);
 
diff --git a/src/additionally.h b/src/additionally.h
index daf461f..f82ca92 100644
--- a/src/additionally.h
+++ b/src/additionally.h
@@ -14,20 +14,6 @@
 #include <limits.h>
 #include <stdint.h>
 
-#ifdef CUDNN
-#include "cudnn.h"
-#endif
-
-#ifdef GPU
-#include "cuda_runtime.h"
-#include "curand.h"
-#include "cublas_v2.h"
-#endif
-
-#ifdef OPENCL
-#include "CL/cl.h"
-#endif
-
 #ifdef OPENCV
 #include <opencv2/core/fast_math.hpp>
 #include "opencv2/highgui/highgui_c.h"
@@ -184,46 +170,6 @@ extern "C" {
     // float32 to bit-1 and align weights for ALL layers
     void calculate_binary_weights(struct network net);
 
-    // -------------- XNOR-net GPU ------------
-
-#ifdef GPU
-    void swap_binary(convolutional_layer *l);
-
-    void binarize_weights_gpu(float *weights, int n, int size, float *binary);
-
-    void binarize_gpu(float *x, int n, float *binary);
-
-    void im2col_align_ongpu(float *im,
-        int channels, int height, int width,
-        int ksize, int stride, int pad, float *data_col, int bit_align);
-
-    void im2col_align_bin_ongpu(float *im,
-        int channels, int height, int width,
-        int ksize, int stride, int pad, float *data_col, int bit_align);
-
-    void float_to_bit_gpu(float *src, unsigned char *dst, size_t size);
-
-    void transpose_bin_gpu(unsigned char *A, unsigned char *B, const int n, const int m,
-        const int lda, const int ldb, const int block_size);
-
-    void fill_int8_gpu(unsigned char *src, unsigned char val, size_t size);
-
-    //void gemm_nn_custom_bin_mean_transposed_gpu(int M, int N, int K,
-    //    unsigned char *A, int lda,
-    //    unsigned char *B, int ldb,
-    //    float *C, int ldc, float *mean_arr);
-
-    void gemm_nn_custom_bin_mean_transposed_gpu(int M, int N, int K,
-        unsigned char *A, int lda,
-        unsigned char *B, int ldb,
-        float *C, int ldc, float *mean_arr, float *bias);
-
-    void gemm_nn_custom_bin_mean_transposed_sequentially_gpu(int M, int N, int K,
-        unsigned char *A, int lda,
-        unsigned char *B, int ldb,
-        float *C, int ldc, float *mean_arr);
-
-#endif // GPU
 
     // -------------- blas.h --------------
 
@@ -611,85 +557,6 @@ extern "C" {
         float *binary_input;
 
         size_t workspace_size;
-
-#ifdef GPU
-        float *z_gpu;
-        float *r_gpu;
-        float *h_gpu;
-
-        int *indexes_gpu;
-        float * prev_state_gpu;
-        float * forgot_state_gpu;
-        float * forgot_delta_gpu;
-        float * state_gpu;
-        float * state_delta_gpu;
-        float * gate_gpu;
-        float * gate_delta_gpu;
-        float * save_gpu;
-        float * save_delta_gpu;
-        float * concat_gpu;
-        float * concat_delta_gpu;
-
-        float *binary_input_gpu;
-        float *binary_weights_gpu;
-
-        float * mean_gpu;
-        float * variance_gpu;
-
-        float * rolling_mean_gpu;
-        float * rolling_variance_gpu;
-
-        float * variance_delta_gpu;
-        float * mean_delta_gpu;
-
-        float * col_image_gpu;
-
-        float * x_gpu;
-        float * x_norm_gpu;
-        float * weights_gpu;
-        //float * weight_updates_gpu;
-        int8_t * weights_int8_gpu;
-        int8_t * weights_int8_int8x4_gpu;
-
-        float * biases_gpu;
-        //float * bias_updates_gpu;
-        float * biases_quant_gpu;
-
-        float * scales_gpu;
-        //float * scale_updates_gpu;
-
-        float * output_gpu;
-        int8_t *output_gpu_int8;
-        float * delta_gpu;
-        float * rand_gpu;
-        float * squared_gpu;
-        float * norms_gpu;
-#ifdef CUDNN
-        cudnnTensorDescriptor_t biasTensorDesc;
-        cudnnActivationDescriptor_t activationDesc;
-        cudnnTensorDescriptor_t srcTensorDesc, dstTensorDesc;
-        //cudnnTensorDescriptor_t dsrcTensorDesc, ddstTensorDesc;
-        cudnnFilterDescriptor_t weightDesc;
-        //cudnnFilterDescriptor_t dweightDesc;
-        cudnnConvolutionDescriptor_t convDesc;
-        cudnnConvolutionFwdAlgo_t fw_algo;
-        //cudnnConvolutionBwdDataAlgo_t bd_algo;
-        //cudnnConvolutionBwdFilterAlgo_t bf_algo;
-        cudnnPoolingDescriptor_t poolingDesc;
-#endif
-#endif
-
-#ifdef OPENCL
-        cl_mem weights_ocl;
-        cl_mem biases_ocl;
-        cl_mem scales_ocl;
-        cl_mem rolling_mean_ocl;
-        cl_mem rolling_variance_ocl;
-
-        cl_mem output_ocl;
-        cl_mem indexes_ocl;
-        cl_mem x_ocl;
-#endif
     };
 
     typedef layer local_layer;
@@ -756,17 +623,6 @@ extern "C" {
         int gpu_index;
         tree *hierarchy;
         int do_input_calibration;
-
-#ifdef GPU
-        float *input_state_gpu;
-
-        float **input_gpu;
-        float **truth_gpu;
-#endif
-
-#ifdef OPENCL
-        cl_mem workspace_ocl;
-#endif
     } network;
 
     typedef struct network_state {
@@ -778,29 +634,12 @@ extern "C" {
         int train;
         int index;
         network net;
-#ifdef OPENCL
-        cl_mem input_ocl;
-        cl_mem workspace_ocl;
-#endif
     } network_state;
 
 
     // network.c
     network make_network(int n);
 
-
-    // network.c
-#ifdef GPU
-#ifdef CUDNN
-    void cudnn_convolutional_setup(layer *l);
-    void cuda_set_device(int n);
-#endif
-#endif
-
-#ifdef OPENCL
-    bool ocl_initialize();
-#endif
-
     // network.c
     void set_batch_network(network *net, int b);
 
@@ -957,25 +796,6 @@ extern "C" {
 
     // -------------- yolov2_forward_network_gpu.c --------------------
 
-#ifdef GPU
-    // detect on GPU: yolov2_forward_network_gpu.cu
-    float *network_predict_gpu_cudnn(network net, float *input);
-
-    // detect on GPU: yolov2_forward_network_gpu.cu - quantized INT8x4
-    float *network_predict_gpu_cudnn_quantized(network net, float *input);
-
-    // // init weights and cuDNN for quantized IINT8x4
-    void init_gpu_int8x4(network net);
-#endif
-
-    // -------------- yolov2_forward_network_ocl.c --------------------
-
-#ifdef OPENCL
-    // detect using OpenCL: yolov2_forward_network_gpu.cpp
-    float *network_predict_opencl(network net, float *input);
-#endif
-
-
     // -------------- gettimeofday for Windows--------------------
 
 #if defined(_MSC_VER)
diff --git a/src/main.c b/src/main.c
index 95dbb5a..959ca22 100644
--- a/src/main.c
+++ b/src/main.c
@@ -110,13 +110,6 @@ void draw_detections_v3(image im, detection *dets, int num, float thresh, char *
         if (width < 1)
             width = 1;
 
-        /*
-        if(0){
-        width = pow(prob, 1./2.)*10+1;
-        alphabet = 0;
-        }
-        */
-
         //printf("%d %s: %.0f%%\n", i, names[selected_detections[i].best_class], prob*100);
         int offset = selected_detections[i].best_class * 123457 % classes;
         float red = get_color(2, offset, classes);
@@ -195,19 +188,8 @@ void test_detector_cpu(char **names, char *cfgfile, char *weightfile, char *file
 
         float *X = sized.data;
         time = clock();
-        //network_predict(net, X);
-#ifdef GPU
-        if (quantized) {
-            network_predict_gpu_cudnn_quantized(net, X);    // quantized works only with Yolo v2
-                                                            //nms = 0.2;
-        }
-        else {
-            network_predict_gpu_cudnn(net, X);
-        }
-#else
-#ifdef OPENCL
-        network_predict_opencl(net, X);
-#else
+        
+		//network_predict(net, X);
         if (quantized) {
             network_predict_quantized(net, X);    // quantized works only with Yolo v2
             nms = 0.2;
@@ -215,9 +197,8 @@ void test_detector_cpu(char **names, char *cfgfile, char *weightfile, char *file
         else {
             network_predict_cpu(net, X);
         }
-#endif
-#endif
-        printf("%s: Predicted in %f seconds.\n", input, (float)(clock() - time) / CLOCKS_PER_SEC); //sec(clock() - time));
+
+		printf("%s: Predicted in %f seconds.\n", input, (float)(clock() - time) / CLOCKS_PER_SEC); //sec(clock() - time));
         //get_region_boxes_cpu(l, 1, 1, thresh, probs, boxes, 0, 0);            // get_region_boxes(): region_layer.c
 
         //  nms (non maximum suppression) - if (IoU(box[i], box[j]) > nms) then remove one of two boxes with lower probability
@@ -238,6 +219,7 @@ void test_detector_cpu(char **names, char *cfgfile, char *weightfile, char *file
         free_image(sized);                // image.c
         free(boxes);
         free_ptrs((void **)probs, l.w*l.h*l.n);    // utils.c
+
 #ifdef OPENCV
         cvWaitKey(0);
         cvDestroyAllWindows();
@@ -246,340 +228,6 @@ void test_detector_cpu(char **names, char *cfgfile, char *weightfile, char *file
     }
 }
 
-
-// --------------- Detect on the Video ---------------
-
-#ifdef OPENCV
-static char **demo_names;
-static int demo_classes;
-static int demo_quantized;
-
-static float **probs;
-static box *boxes;
-static network net;
-static image in;
-static image in_s;
-static image det;
-static image det_s;
-static image disp = { 0 };
-static CvCapture * cap;
-static float fps = 0;
-static float demo_thresh = 0;
-
-IplImage* in_img;
-IplImage* det_img;
-IplImage* show_img;
-
-// draw bounded boxes of found objects on the image, from: image.c
-void draw_detections_cv_v3(IplImage* show_img, detection *dets, int num, float thresh, char **names, image **alphabet, int classes, int ext_output)
-{
-    int i, j;
-    if (!show_img) return;
-    static int frame_id = 0;
-    frame_id++;
-
-    for (i = 0; i < num; ++i) {
-        char labelstr[4096] = { 0 };
-        int class_id = -1;
-        for (j = 0; j < classes; ++j) {
-            if (dets[i].prob[j] > thresh) {
-                if (class_id < 0) {
-                    strcat(labelstr, names[j]);
-                    class_id = j;
-                }
-                else {
-                    strcat(labelstr, ", ");
-                    strcat(labelstr, names[j]);
-                }
-                printf("%s: %.0f%% ", names[j], dets[i].prob[j] * 100);
-            }
-        }
-        if (class_id >= 0) {
-            int width = show_img->height * .006;
-
-            //printf("%d %s: %.0f%%\n", i, names[class_id], prob*100);
-            int offset = class_id * 123457 % classes;
-            float red = get_color(2, offset, classes);
-            float green = get_color(1, offset, classes);
-            float blue = get_color(0, offset, classes);
-            float rgb[3];
-
-            //width = prob*20+2;
-
-            rgb[0] = red;
-            rgb[1] = green;
-            rgb[2] = blue;
-            box b = dets[i].bbox;
-            //printf("%f %f %f %f\n", b.x, b.y, b.w, b.h);
-
-            int left = (b.x - b.w / 2.)*show_img->width;
-            int right = (b.x + b.w / 2.)*show_img->width;
-            int top = (b.y - b.h / 2.)*show_img->height;
-            int bot = (b.y + b.h / 2.)*show_img->height;
-
-            if (left < 0) left = 0;
-            if (right > show_img->width - 1) right = show_img->width - 1;
-            if (top < 0) top = 0;
-            if (bot > show_img->height - 1) bot = show_img->height - 1;
-
-            float const font_size = show_img->height / 1000.F;
-            CvPoint pt1, pt2, pt_text, pt_text_bg1, pt_text_bg2;
-            pt1.x = left;
-            pt1.y = top;
-            pt2.x = right;
-            pt2.y = bot;
-            pt_text.x = left;
-            pt_text.y = top - 12;
-            pt_text_bg1.x = left;
-            pt_text_bg1.y = top - (10 + 25 * font_size);
-            pt_text_bg2.x = right;
-            pt_text_bg2.y = top;
-            CvScalar color;
-            color.val[0] = red * 256;
-            color.val[1] = green * 256;
-            color.val[2] = blue * 256;
-
-            cvRectangle(show_img, pt1, pt2, color, width, 8, 0);
-            if (ext_output)
-                printf("\t(left_x: %4.0f   top_y: %4.0f   width: %4.0f   height: %4.0f)\n",
-                (float)left, (float)top, b.w*show_img->width, b.h*show_img->height);
-            else
-                printf("\n");
-            cvRectangle(show_img, pt_text_bg1, pt_text_bg2, color, width, 8, 0);
-            cvRectangle(show_img, pt_text_bg1, pt_text_bg2, color, CV_FILLED, 8, 0);    // filled
-            CvScalar black_color;
-            black_color.val[0] = 0;
-            CvFont font;
-            cvInitFont(&font, CV_FONT_HERSHEY_SIMPLEX, font_size, font_size, 0, font_size * 3, 8);
-            cvPutText(show_img, labelstr, pt_text, &font, black_color);
-        }
-    }
-}
-
-
-
-image get_image_from_stream_resize_cpu(CvCapture *cap, int w, int h, IplImage** in_img)
-{
-    IplImage* src = cvQueryFrame(cap);
-    if (!src) return make_empty_image(0, 0, 0);
-    IplImage* new_img = cvCreateImage(cvSize(w, h), IPL_DEPTH_8U, 3);
-    *in_img = cvCreateImage(cvSize(src->width, src->height), IPL_DEPTH_8U, 3);
-    cvResize(src, *in_img, CV_INTER_LINEAR);
-    cvResize(src, new_img, CV_INTER_LINEAR);
-    image im = ipl_to_image(new_img);
-    cvReleaseImage(&new_img);
-    rgbgr_image(im);
-    return im;
-}
-
-static void *fetch_in_thread(void *ptr)
-{
-    in = get_image_from_stream_resize_cpu(cap, net.w, net.h, &in_img);    // image.c
-    if (!in.data) {
-        error("Stream closed.");
-    }
-    in_s = make_image(in.w, in.h, in.c);    // image.c
-    memcpy(in_s.data, in.data, in.h*in.w*in.c * sizeof(float));
-
-    return 0;
-}
-
-static void *detect_in_thread(void *ptr)
-{
-    float nms = .4;
-    layer l = net.layers[net.n - 1];
-    float *X = det_s.data;
-
-    //float *prediction = network_predict(net, X);
-#ifdef GPU
-    if (demo_quantized) {
-        network_predict_gpu_cudnn_quantized(net, X);    // quantized works only with Yolo v2
-                                                        //nms = 0.2;
-    }
-    else {
-        network_predict_gpu_cudnn(net, X);
-    }
-#else
-#ifdef OPENCL
-    network_predict_opencl(net, X);
-#else
-    if (demo_quantized) {
-        network_predict_quantized(net, X);    // quantized works only with Yolo v2
-        nms = 0.2;
-    }
-    else {
-        network_predict_cpu(net, X);
-    }
-#endif
-#endif
-
-    free_image(det_s);
-    //get_region_boxes_cpu(l, 1, 1, demo_thresh, probs, boxes, 0, 0);        // get_region_boxes(): region_layer.c
-    //if (nms) do_nms_sort(boxes, probs, l.w*l.h*l.n, l.classes, nms);    // box.c
-    float hier_thresh = 0.5;
-    int ext_output = 1, letterbox = 0, nboxes = 0;
-    detection *dets = NULL;
-    if (letterbox)
-        dets = get_network_boxes(&net, in_img->width, in_img->height, demo_thresh, demo_thresh, 0, 1, &nboxes, 1); // letter box
-    else
-        dets = get_network_boxes(&net, det_s.w, det_s.h, demo_thresh, demo_thresh, 0, 1, &nboxes, 0); // resized
-                                                                                                      //if (nms) do_nms_obj(dets, nboxes, l.classes, nms);    // bad results
-    if (nms) do_nms_sort(dets, nboxes, l.classes, nms);
-    draw_detections_cv_v3(det_img, dets, nboxes, demo_thresh, demo_names, NULL, demo_classes, ext_output);
-    free_detections(dets, nboxes);
-
-    printf("\033[2J");
-    printf("\033[1;1H");
-    printf("\nFPS:%.1f\n", fps);
-    printf("Objects:\n\n");
-
-    return 0;
-}
-
-static double get_wall_time()
-{
-    struct timeval time;
-    if (gettimeofday(&time, NULL)) {
-        return 0;
-    }
-    return (double)time.tv_sec + (double)time.tv_usec * .000001;
-}
-
-
-// Detect on Video: this function uses other functions not from this file
-void demo(char *cfgfile, char *weightfile, float thresh, int cam_index, const char *filename, char **names, int classes,
-    int frame_skip, char *prefix, int quantized, char *out_filename, int dont_show)
-{
-    int delay = frame_skip;
-    demo_names = names;
-    demo_classes = classes;
-    demo_thresh = thresh;
-    printf("Demo\n");
-    net = parse_network_cfg(cfgfile, 1, quantized);
-    if (weightfile) {
-        //load_weights(&net, weightfile);            // parser.c
-        load_weights_upto_cpu(&net, weightfile, net.n);
-    }
-    //set_batch_network(&net, 1);
-    yolov2_fuse_conv_batchnorm(net);
-    calculate_binary_weights(net);
-    if (quantized) {
-        printf("\n\n Quantinization! \n\n");
-        demo_quantized = 1;
-        quantinization_and_get_multipliers(net);
-    }
-    srand(2222222);
-
-    if (filename) {
-        printf("video file: %s\n", filename);
-        cap = cvCaptureFromFile(filename);
-    }
-    else {
-        cap = cvCaptureFromCAM(cam_index);
-    }
-
-    if (!cap) error("Couldn't connect to webcam.\n");
-
-    layer l = net.layers[net.n - 1];
-    int j;
-
-    boxes = (box *)calloc(l.w*l.h*l.n, sizeof(box));
-    probs = (float **)calloc(l.w*l.h*l.n, sizeof(float *));
-    for (j = 0; j < l.w*l.h*l.n; ++j) probs[j] = (float *)calloc(l.classes, sizeof(float *));
-
-    pthread_t fetch_thread;
-    pthread_t detect_thread;
-
-    fetch_in_thread(0);
-    det_img = in_img;
-    det = in;
-    det_s = in_s;
-
-    fetch_in_thread(0);
-    detect_in_thread(0);
-    disp = det;
-    show_img = det_img;
-    det_img = in_img;
-    det = in;
-    det_s = in_s;
-
-    int count = 0;
-    if (!prefix && !dont_show) {
-        cvNamedWindow("Demo", CV_WINDOW_NORMAL);
-        cvMoveWindow("Demo", 0, 0);
-        cvResizeWindow("Demo", 1352, 1013);
-    }
-
-    CvVideoWriter* output_video_writer = NULL;    // cv::VideoWriter output_video;
-    if (out_filename)
-    {
-        CvSize size;
-        size.width = det_img->width, size.height = det_img->height;
-        int src_fps = 25;
-        src_fps = cvGetCaptureProperty(cap, CV_CAP_PROP_FPS);
-        output_video_writer = cvCreateVideoWriter(out_filename, CV_FOURCC('D', 'I', 'V', 'X'), src_fps, size, 1);
-    }
-
-    double before = get_wall_time();
-
-    while (1) {
-        ++count;
-        if (pthread_create(&fetch_thread, 0, fetch_in_thread, 0)) error("Thread creation failed");
-        if (pthread_create(&detect_thread, 0, detect_in_thread, 0)) error("Thread creation failed");
-
-        if (!prefix) {
-            if (!dont_show) {
-                //show_image(disp, "Demo");
-                show_image_cv_ipl(show_img, "Demo");
-                int c = cvWaitKey(1);
-            }
-        }
-        else {
-            char buff[256];
-            sprintf(buff, "%s_%08d", prefix, count);
-            save_image_png(disp, buff);
-        }
-
-        // save video file
-        if (output_video_writer && show_img) {
-            cvWriteFrame(output_video_writer, show_img);
-            //printf("\n cvWriteFrame \n");
-        }
-
-        cvReleaseImage(&show_img);
-
-        pthread_join(fetch_thread, 0);
-        pthread_join(detect_thread, 0);
-
-        if (delay == 0) {
-            free_image(disp);
-            disp = det;
-            show_img = det_img;
-        }
-        det_img = in_img;
-        det = in;
-        det_s = in_s;
-
-        --delay;
-        if (delay < 0) {
-            delay = frame_skip;
-
-            double after = get_wall_time();
-            float curr = 1. / (after - before);
-            fps = curr;
-            before = after;
-        }
-    }
-}
-#else
-void demo(char *cfgfile, char *weightfile, float thresh, int cam_index, const char *filename, char **names, int classes,
-    int frame_skip, char *prefix, int quantized, char *out_filename, int dont_show)
-{
-    fprintf(stderr, "Demo needs OpenCV for webcam images.\n");
-}
-#endif
-
-
 // get command line parameters and load objects names
 void run_detector(int argc, char **argv)
 {
@@ -592,8 +240,9 @@ void run_detector(int argc, char **argv)
     int quantized = find_arg(argc, argv, "-quantized");
     int input_calibration = find_int_arg(argc, argv, "-input_calibration", 0);
     int frame_skip = find_int_arg(argc, argv, "-s", 0);
-    if (argc < 4) {
-        fprintf(stderr, "usage: %s %s [demo/test/] [cfg] [weights (optional)]\n", argv[0], argv[1]);
+    
+	if (argc < 4) {
+        fprintf(stderr, "usage: %s %s [test] [cfg] [weights (optional)]\n", argv[0], argv[1]);
         return;
     }
 
@@ -620,15 +269,7 @@ void run_detector(int argc, char **argv)
     int classes = obj_count;
 
     if (0 == strcmp(argv[2], "test")) test_detector_cpu(names, cfg, weights, filename, thresh, quantized, dont_show);
-    //else if (0 == strcmp(argv[2], "train")) train_detector(datacfg, cfg, weights, gpus, ngpus, clear);
-    //else if (0 == strcmp(argv[2], "valid")) validate_detector(datacfg, cfg, weights);
-    //else if (0 == strcmp(argv[2], "recall")) validate_detector_recall(datacfg, cfg, weights);
-    else if (0 == strcmp(argv[2], "map")) validate_detector_map(obj_names, cfg, weights, thresh, quantized, iou_thresh);
-    else if (0 == strcmp(argv[2], "calibrate")) validate_calibrate_valid(obj_names, cfg, weights, input_calibration);
-    else if (0 == strcmp(argv[2], "demo")) {
-        demo(cfg, weights, thresh, cam_index, filename, names, classes, frame_skip, prefix, quantized, out_filename, dont_show);
-    }
-
+    
     int i;
     for (i = 0; i < obj_count; ++i) free(names[i]);
     free(names);
diff --git a/x64/cpu_Release/additionally.obj b/x64/cpu_Release/additionally.obj
new file mode 100644
index 0000000..5ce2b62
Binary files /dev/null and b/x64/cpu_Release/additionally.obj differ
diff --git a/x64/cpu_Release/box.obj b/x64/cpu_Release/box.obj
new file mode 100644
index 0000000..4b3c793
Binary files /dev/null and b/x64/cpu_Release/box.obj differ
diff --git a/x64/cpu_Release/main.obj b/x64/cpu_Release/main.obj
new file mode 100644
index 0000000..2147715
Binary files /dev/null and b/x64/cpu_Release/main.obj differ
diff --git a/x64/cpu_Release/vc140.pdb b/x64/cpu_Release/vc140.pdb
new file mode 100644
index 0000000..838640e
Binary files /dev/null and b/x64/cpu_Release/vc140.pdb differ
diff --git a/x64/cpu_Release/yolo_cpu.Build.CppClean.log b/x64/cpu_Release/yolo_cpu.Build.CppClean.log
new file mode 100644
index 0000000..3b764e4
--- /dev/null
+++ b/x64/cpu_Release/yolo_cpu.Build.CppClean.log
@@ -0,0 +1,16 @@
+f:\github\yolo2_light_cpu\x64\cpu_release\yolov2_forward_network.obj
+f:\github\yolo2_light_cpu\x64\cpu_release\yolov2_forward_network_quantized.obj
+f:\github\yolo2_light_cpu\x64\cpu_release\main.obj
+f:\github\yolo2_light_cpu\x64\cpu_release\additionally.obj
+f:\github\yolo2_light_cpu\x64\cpu_release\box.obj
+f:\github\yolo2_light_cpu\x64\cpu_release\vc140.pdb
+f:\github\yolo2_light_cpu\bin\yolo_cpu.exe
+f:\github\yolo2_light_cpu\bin\yolo_cpu.ipdb
+f:\github\yolo2_light_cpu\bin\yolo_cpu.iobj
+f:\github\yolo2_light_cpu\bin\yolo_cpu.pdb
+f:\github\yolo2_light_cpu\x64\cpu_release\yolo_cpu.tlog\cl.command.1.tlog
+f:\github\yolo2_light_cpu\x64\cpu_release\yolo_cpu.tlog\cl.read.1.tlog
+f:\github\yolo2_light_cpu\x64\cpu_release\yolo_cpu.tlog\cl.write.1.tlog
+f:\github\yolo2_light_cpu\x64\cpu_release\yolo_cpu.tlog\link.command.1.tlog
+f:\github\yolo2_light_cpu\x64\cpu_release\yolo_cpu.tlog\link.read.1.tlog
+f:\github\yolo2_light_cpu\x64\cpu_release\yolo_cpu.tlog\link.write.1.tlog
diff --git a/x64/cpu_Release/yolo_cpu.log b/x64/cpu_Release/yolo_cpu.log
new file mode 100644
index 0000000..4a698a7
--- /dev/null
+++ b/x64/cpu_Release/yolo_cpu.log
@@ -0,0 +1,198 @@
+﻿  additionally.c
+  box.c
+  main.c
+  yolov2_forward_network.c
+  yolov2_forward_network_quantized.c
+src\box.c(104): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data
+src\box.c(139): warning C4305: 'initializing': truncation from 'double' to 'float'
+src\box.c(140): warning C4305: 'initializing': truncation from 'double' to 'float'
+src\box.c(141): warning C4305: 'initializing': truncation from 'double' to 'float'
+src\box.c(142): warning C4305: 'initializing': truncation from 'double' to 'float'
+src\box.c(144): warning C4305: 'initializing': truncation from 'double' to 'float'
+src\box.c(161): warning C4305: 'initializing': truncation from 'double' to 'float'
+src\box.c(162): warning C4305: 'initializing': truncation from 'double' to 'float'
+src\box.c(163): warning C4305: 'initializing': truncation from 'double' to 'float'
+src\box.c(164): warning C4305: 'initializing': truncation from 'double' to 'float'
+src\box.c(166): warning C4305: 'initializing': truncation from 'double' to 'float'
+src\box.c(186): warning C4305: 'initializing': truncation from 'double' to 'float'
+src\box.c(187): warning C4305: 'initializing': truncation from 'double' to 'float'
+src\box.c(188): warning C4305: 'initializing': truncation from 'double' to 'float'
+src\box.c(189): warning C4305: 'initializing': truncation from 'double' to 'float'
+src\box.c(191): warning C4305: 'initializing': truncation from 'double' to 'float'
+src\box.c(226): warning C4244: '=': conversion from 'double' to 'float', possible loss of data
+src\box.c(227): warning C4244: '=': conversion from 'double' to 'float', possible loss of data
+src\box.c(228): warning C4244: '=': conversion from 'double' to 'float', possible loss of data
+src\box.c(229): warning C4244: '=': conversion from 'double' to 'float', possible loss of data
+src\box.c(355): warning C4244: '=': conversion from 'double' to 'float', possible loss of data
+src\box.c(356): warning C4244: '=': conversion from 'double' to 'float', possible loss of data
+src\box.c(365): warning C4244: '=': conversion from 'double' to 'float', possible loss of data
+src\box.c(366): warning C4244: '=': conversion from 'double' to 'float', possible loss of data
+f:\github\yolo2_light_cpu\src\additionally.h(61): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\yolov2_forward_network.c)
+f:\github\yolo2_light_cpu\src\additionally.h(62): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\yolov2_forward_network.c)
+f:\github\yolo2_light_cpu\src\additionally.h(61): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\additionally.c)
+f:\github\yolo2_light_cpu\src\additionally.h(60): warning C4244: 'initializing': conversion from 'double' to 'int', possible loss of data (compiling source file src\yolov2_forward_network.c)
+f:\github\yolo2_light_cpu\src\additionally.h(61): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\yolov2_forward_network_quantized.c)
+f:\github\yolo2_light_cpu\src\additionally.h(62): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\additionally.c)
+f:\github\yolo2_light_cpu\src\additionally.h(61): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\main.c)
+f:\github\yolo2_light_cpu\src\additionally.h(71): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\yolov2_forward_network.c)
+f:\github\yolo2_light_cpu\src\additionally.h(62): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\yolov2_forward_network_quantized.c)
+f:\github\yolo2_light_cpu\src\additionally.h(60): warning C4244: 'initializing': conversion from 'double' to 'int', possible loss of data (compiling source file src\additionally.c)
+f:\github\yolo2_light_cpu\src\additionally.h(62): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\main.c)
+f:\github\yolo2_light_cpu\src\additionally.h(72): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\yolov2_forward_network.c)
+f:\github\yolo2_light_cpu\src\additionally.h(60): warning C4244: 'initializing': conversion from 'double' to 'int', possible loss of data (compiling source file src\yolov2_forward_network_quantized.c)
+f:\github\yolo2_light_cpu\src\additionally.h(71): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\additionally.c)
+f:\github\yolo2_light_cpu\src\additionally.h(60): warning C4244: 'initializing': conversion from 'double' to 'int', possible loss of data (compiling source file src\main.c)
+f:\github\yolo2_light_cpu\src\additionally.h(74): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\yolov2_forward_network.c)
+f:\github\yolo2_light_cpu\src\additionally.h(71): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\yolov2_forward_network_quantized.c)
+f:\github\yolo2_light_cpu\src\additionally.h(72): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\additionally.c)
+f:\github\yolo2_light_cpu\src\additionally.h(71): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\main.c)
+f:\github\yolo2_light_cpu\src\additionally.h(75): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\yolov2_forward_network.c)
+f:\github\yolo2_light_cpu\src\additionally.h(72): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\yolov2_forward_network_quantized.c)
+f:\github\yolo2_light_cpu\src\additionally.h(74): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\additionally.c)
+f:\github\yolo2_light_cpu\src\additionally.h(72): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\main.c)
+f:\github\yolo2_light_cpu\src\additionally.h(76): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\yolov2_forward_network.c)
+f:\github\yolo2_light_cpu\src\additionally.h(74): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\yolov2_forward_network_quantized.c)
+f:\github\yolo2_light_cpu\src\additionally.h(75): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\additionally.c)
+f:\github\yolo2_light_cpu\src\additionally.h(74): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\main.c)
+f:\github\yolo2_light_cpu\src\additionally.h(77): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\yolov2_forward_network.c)
+f:\github\yolo2_light_cpu\src\additionally.h(75): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\yolov2_forward_network_quantized.c)
+f:\github\yolo2_light_cpu\src\additionally.h(76): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\additionally.c)
+f:\github\yolo2_light_cpu\src\additionally.h(75): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\main.c)
+f:\github\yolo2_light_cpu\src\additionally.h(78): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\yolov2_forward_network.c)
+f:\github\yolo2_light_cpu\src\additionally.h(76): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\yolov2_forward_network_quantized.c)
+f:\github\yolo2_light_cpu\src\additionally.h(77): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\additionally.c)
+f:\github\yolo2_light_cpu\src\additionally.h(76): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\main.c)
+f:\github\yolo2_light_cpu\src\additionally.h(81): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\yolov2_forward_network.c)
+f:\github\yolo2_light_cpu\src\additionally.h(77): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\yolov2_forward_network_quantized.c)
+f:\github\yolo2_light_cpu\src\additionally.h(78): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\additionally.c)
+f:\github\yolo2_light_cpu\src\additionally.h(77): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\main.c)
+f:\github\yolo2_light_cpu\src\additionally.h(82): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\yolov2_forward_network.c)
+f:\github\yolo2_light_cpu\src\additionally.h(78): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\yolov2_forward_network_quantized.c)
+f:\github\yolo2_light_cpu\src\additionally.h(81): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\additionally.c)
+f:\github\yolo2_light_cpu\src\additionally.h(78): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\main.c)
+f:\github\yolo2_light_cpu\src\additionally.h(83): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\yolov2_forward_network.c)
+f:\github\yolo2_light_cpu\src\additionally.h(81): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\yolov2_forward_network_quantized.c)
+f:\github\yolo2_light_cpu\src\additionally.h(82): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\additionally.c)
+f:\github\yolo2_light_cpu\src\additionally.h(81): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\main.c)
+f:\github\yolo2_light_cpu\src\additionally.h(88): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\yolov2_forward_network.c)
+f:\github\yolo2_light_cpu\src\additionally.h(82): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\yolov2_forward_network_quantized.c)
+f:\github\yolo2_light_cpu\src\additionally.h(83): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\additionally.c)
+f:\github\yolo2_light_cpu\src\additionally.h(82): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\main.c)
+f:\github\yolo2_light_cpu\src\additionally.h(89): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\yolov2_forward_network.c)
+f:\github\yolo2_light_cpu\src\additionally.h(83): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\yolov2_forward_network_quantized.c)
+f:\github\yolo2_light_cpu\src\additionally.h(88): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\additionally.c)
+f:\github\yolo2_light_cpu\src\additionally.h(83): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\main.c)
+f:\github\yolo2_light_cpu\src\additionally.h(88): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\yolov2_forward_network_quantized.c)
+f:\github\yolo2_light_cpu\src\additionally.h(89): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\additionally.c)
+f:\github\yolo2_light_cpu\src\additionally.h(88): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\main.c)
+f:\github\yolo2_light_cpu\src\additionally.h(89): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\yolov2_forward_network_quantized.c)
+f:\github\yolo2_light_cpu\src\additionally.h(89): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data (compiling source file src\main.c)
+src\yolov2_forward_network.c(24): warning C4133: 'function': incompatible types - from 'float *' to 'uint32_t *'
+src\yolov2_forward_network.c(24): warning C4133: 'function': incompatible types - from 'char *' to 'uint32_t *'
+src\yolov2_forward_network.c(24): warning C4267: 'function': conversion from 'size_t' to 'const int', possible loss of data
+src\yolov2_forward_network.c(127): warning C4267: 'function': conversion from 'size_t' to 'int', possible loss of data
+src\yolov2_forward_network_quantized.c(111): warning C4244: '=': conversion from 'double' to 'int', possible loss of data
+src\yolov2_forward_network_quantized.c(138): warning C4244: 'initializing': conversion from 'double' to 'int', possible loss of data
+src\main.c(109): warning C4244: 'initializing': conversion from 'double' to 'int', possible loss of data
+src\yolov2_forward_network_quantized.c(157): warning C4244: 'initializing': conversion from 'double' to 'int', possible loss of data
+src\main.c(128): warning C4244: 'initializing': conversion from 'double' to 'int', possible loss of data
+src\yolov2_forward_network_quantized.c(104): warning C4101: 'i': unreferenced local variable
+src\main.c(129): warning C4244: 'initializing': conversion from 'double' to 'int', possible loss of data
+src\main.c(130): warning C4244: 'initializing': conversion from 'double' to 'int', possible loss of data
+src\main.c(131): warning C4244: 'initializing': conversion from 'double' to 'int', possible loss of data
+src\yolov2_forward_network_quantized.c(554): warning C4133: '=': incompatible types - from 'int *' to 'int8_t *'
+src\main.c(195): warning C4305: '=': truncation from 'double' to 'float'
+src\yolov2_forward_network_quantized.c(558): warning C4244: 'initializing': conversion from 'float' to 'int16_t', possible loss of data
+src\main.c(169): warning C4305: 'initializing': truncation from 'double' to 'float'
+src\yolov2_forward_network_quantized.c(532): warning C4101: 'f': unreferenced local variable
+src\yolov2_forward_network_quantized.c(757): warning C4244: '=': conversion from 'float' to 'conv_t', possible loss of data
+src\yolov2_forward_network_quantized.c(764): warning C4244: '+=': conversion from 'float' to 'conv_t', possible loss of data
+src\yolov2_forward_network_quantized.c(641): warning C4101: 'f': unreferenced local variable
+src\yolov2_forward_network_quantized.c(1042): warning C4013: 'forward_maxpool_layer_cpu' undefined; assuming extern returning int
+src\yolov2_forward_network_quantized.c(1046): warning C4013: 'forward_route_layer_cpu' undefined; assuming extern returning int
+src\yolov2_forward_network_quantized.c(1050): warning C4013: 'forward_reorg_layer_cpu' undefined; assuming extern returning int
+src\yolov2_forward_network_quantized.c(1054): warning C4013: 'forward_upsample_layer_cpu' undefined; assuming extern returning int
+src\yolov2_forward_network_quantized.c(1058): warning C4013: 'forward_shortcut_layer_cpu' undefined; assuming extern returning int
+src\yolov2_forward_network_quantized.c(1062): warning C4013: 'forward_yolo_layer_cpu' undefined; assuming extern returning int
+src\yolov2_forward_network_quantized.c(1066): warning C4013: 'forward_region_layer_cpu' undefined; assuming extern returning int
+src\yolov2_forward_network_quantized.c(1030): warning C4101: 'k': unreferenced local variable
+src\yolov2_forward_network_quantized.c(1150): warning C4244: 'initializing': conversion from 'double' to 'int16_t', possible loss of data
+src\yolov2_forward_network_quantized.c(1095): warning C4101: 'k': unreferenced local variable
+src\yolov2_forward_network_quantized.c(1201): warning C4244: 'initializing': conversion from 'float' to 'int16_t', possible loss of data
+src\yolov2_forward_network_quantized.c(1334): warning C4244: '+=': conversion from 'float' to 'uint64_t', possible loss of data
+src\yolov2_forward_network_quantized.c(1370): warning C4244: '+=': conversion from 'double' to 'float', possible loss of data
+src\yolov2_forward_network_quantized.c(1381): warning C4244: '=': conversion from 'int' to 'float', possible loss of data
+src\yolov2_forward_network_quantized.c(1296): warning C4244: 'initializing': conversion from 'const float' to 'const int', possible loss of data
+src\yolov2_forward_network_quantized.c(1385): warning C4244: 'initializing': conversion from 'double' to 'float', possible loss of data
+src\yolov2_forward_network_quantized.c(1429): warning C4267: 'function': conversion from 'size_t' to 'int', possible loss of data
+src\yolov2_forward_network_quantized.c(1444): warning C4244: 'function': conversion from 'float' to 'int', possible loss of data
+src\yolov2_forward_network_quantized.c(1418): warning C4101: 'k': unreferenced local variable
+src\additionally.c(77): warning C4267: 'initializing': conversion from 'size_t' to 'int', possible loss of data
+src\additionally.c(100): warning C4244: '+=': conversion from 'double' to 'float', possible loss of data
+src\additionally.c(113): warning C4244: '=': conversion from 'int' to 'float', possible loss of data
+src\additionally.c(173): warning C4244: '=': conversion from 'double' to 'float', possible loss of data
+src\additionally.c(182): warning C4267: '=': conversion from 'size_t' to 'int', possible loss of data
+src\additionally.c(187): warning C4267: '=': conversion from 'size_t' to 'int', possible loss of data
+src\additionally.c(245): warning C4244: 'return': conversion from 'unsigned long' to 'uint8_t', possible loss of data
+src\additionally.c(304): warning C4068: unknown pragma
+src\additionally.c(307): warning C4068: unknown pragma
+src\additionally.c(326): warning C4133: 'function': incompatible types - from 'uint32_t *' to 'const unsigned char *const '
+src\additionally.c(326): warning C4133: 'function': incompatible types - from 'uint32_t *' to 'unsigned char *const '
+src\additionally.c(390): warning C4133: 'function': incompatible types - from 'float *' to 'unsigned char *const '
+src\additionally.c(401): warning C4133: 'function': incompatible types - from 'float *' to 'unsigned char *const '
+src\additionally.c(415): warning C4133: 'function': incompatible types - from 'float *' to 'unsigned char *const '
+src\additionally.c(429): warning C4133: 'function': incompatible types - from 'float *' to 'unsigned char *const '
+src\additionally.c(443): warning C4133: 'function': incompatible types - from 'float *' to 'unsigned char *const '
+src\additionally.c(457): warning C4133: 'function': incompatible types - from 'float *' to 'unsigned char *const '
+src\additionally.c(523): warning C4244: 'initializing': conversion from 'DWORD64' to 'int', possible loss of data
+src\additionally.c(545): warning C4101: 'h': unreferenced local variable
+src\additionally.c(657): warning C4267: 'function': conversion from 'size_t' to 'int', possible loss of data
+src\additionally.c(675): warning C4267: 'function': conversion from 'size_t' to 'int', possible loss of data
+src\additionally.c(744): warning C4244: '=': conversion from 'double' to 'float', possible loss of data
+src\additionally.c(820): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data
+src\additionally.c(849): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data
+src\additionally.c(860): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data
+src\additionally.c(888): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data
+src\additionally.c(898): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data
+src\additionally.c(1532): warning C4244: '=': conversion from 'double' to 'float', possible loss of data
+src\additionally.c(1473): warning C4244: 'initializing': conversion from 'double' to 'float', possible loss of data
+src\additionally.c(1729): warning C4244: '=': conversion from 'double' to 'float', possible loss of data
+src\additionally.c(1754): warning C4244: '=': conversion from 'double' to 'float', possible loss of data
+src\additionally.c(1880): warning C4244: 'initializing': conversion from 'double' to 'int', possible loss of data
+src\additionally.c(1881): warning C4244: 'initializing': conversion from 'double' to 'int', possible loss of data
+src\additionally.c(2016): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data
+src\additionally.c(2024): warning C4244: 'return': conversion from 'double' to 'float', possible loss of data
+src\additionally.c(2192): warning C4305: 'function': truncation from 'double' to 'float'
+src\additionally.c(2213): warning C4267: 'initializing': conversion from 'size_t' to 'int', possible loss of data
+src\additionally.c(2220): warning C4244: 'initializing': conversion from 'double' to 'float', possible loss of data
+src\additionally.c(2233): warning C4267: 'initializing': conversion from 'size_t' to 'int', possible loss of data
+src\additionally.c(2270): warning C4305: 'function': truncation from 'double' to 'float'
+src\additionally.c(2279): warning C4267: 'initializing': conversion from 'size_t' to 'int', possible loss of data
+src\additionally.c(2286): warning C4244: 'initializing': conversion from 'double' to 'float', possible loss of data
+src\additionally.c(2372): warning C4267: 'initializing': conversion from 'size_t' to 'int', possible loss of data
+src\additionally.c(2466): warning C4305: 'function': truncation from 'double' to 'float'
+src\additionally.c(2467): warning C4305: 'function': truncation from 'double' to 'float'
+src\additionally.c(2468): warning C4305: 'function': truncation from 'double' to 'float'
+src\additionally.c(2494): warning C4305: 'function': truncation from 'double' to 'float'
+src\additionally.c(2495): warning C4305: 'function': truncation from 'double' to 'float'
+src\additionally.c(2496): warning C4305: 'function': truncation from 'double' to 'float'
+src\additionally.c(2477): warning C4267: 'initializing': conversion from 'size_t' to 'int', possible loss of data
+src\additionally.c(2486): warning C4244: 'initializing': conversion from 'double' to 'float', possible loss of data
+src\additionally.c(2526): warning C4267: 'initializing': conversion from 'size_t' to 'int', possible loss of data
+src\additionally.c(2536): warning C4244: 'initializing': conversion from 'double' to 'float', possible loss of data
+src\additionally.c(2882): warning C4244: '=': conversion from 'double' to 'float', possible loss of data
+src\additionally.c(2883): warning C4244: '=': conversion from 'double' to 'float', possible loss of data
+src\additionally.c(2902): warning C4244: '=': conversion from 'double' to 'float', possible loss of data
+src\additionally.c(2903): warning C4244: '=': conversion from 'double' to 'float', possible loss of data
+src\additionally.c(3141): warning C4244: 'function': conversion from 'time_t' to 'unsigned int', possible loss of data
+src\additionally.c(3160): warning C4305: 'initializing': truncation from 'double' to 'const float'
+src\additionally.c(3161): warning C4305: 'initializing': truncation from 'double' to 'float'
+src\additionally.c(3488): warning C4244: 'function': conversion from 'time_t' to 'unsigned int', possible loss of data
+src\additionally.c(3500): warning C4305: 'initializing': truncation from 'double' to 'const float'
+src\additionally.c(3501): warning C4305: 'initializing': truncation from 'double' to 'float'
+src\additionally.c(3470): warning C4101: 'j': unreferenced local variable
+  Generating code
+  All 226 functions were compiled because no usable IPDB/IOBJ from previous compilation was found.
+  Finished generating code
+  yolo_cpu.vcxproj -> F:\GitHub\yolo2_light_cpu\bin\yolo_cpu.exe
+  yolo_cpu.vcxproj -> bin\yolo_cpu.pdb (Full PDB)
diff --git a/x64/cpu_Release/yolo_cpu.tlog/CL.command.1.tlog b/x64/cpu_Release/yolo_cpu.tlog/CL.command.1.tlog
new file mode 100644
index 0000000..fa0b43e
Binary files /dev/null and b/x64/cpu_Release/yolo_cpu.tlog/CL.command.1.tlog differ
diff --git a/x64/cpu_Release/yolo_cpu.tlog/CL.read.1.tlog b/x64/cpu_Release/yolo_cpu.tlog/CL.read.1.tlog
new file mode 100644
index 0000000..f87400e
Binary files /dev/null and b/x64/cpu_Release/yolo_cpu.tlog/CL.read.1.tlog differ
diff --git a/x64/cpu_Release/yolo_cpu.tlog/CL.write.1.tlog b/x64/cpu_Release/yolo_cpu.tlog/CL.write.1.tlog
new file mode 100644
index 0000000..81b29a4
Binary files /dev/null and b/x64/cpu_Release/yolo_cpu.tlog/CL.write.1.tlog differ
diff --git a/x64/cpu_Release/yolo_cpu.tlog/link.command.1.tlog b/x64/cpu_Release/yolo_cpu.tlog/link.command.1.tlog
new file mode 100644
index 0000000..b162be8
Binary files /dev/null and b/x64/cpu_Release/yolo_cpu.tlog/link.command.1.tlog differ
diff --git a/x64/cpu_Release/yolo_cpu.tlog/link.read.1.tlog b/x64/cpu_Release/yolo_cpu.tlog/link.read.1.tlog
new file mode 100644
index 0000000..8d36393
Binary files /dev/null and b/x64/cpu_Release/yolo_cpu.tlog/link.read.1.tlog differ
diff --git a/x64/cpu_Release/yolo_cpu.tlog/link.write.1.tlog b/x64/cpu_Release/yolo_cpu.tlog/link.write.1.tlog
new file mode 100644
index 0000000..1df0b3f
Binary files /dev/null and b/x64/cpu_Release/yolo_cpu.tlog/link.write.1.tlog differ
diff --git a/x64/cpu_Release/yolo_cpu.tlog/yolo_cpu.lastbuildstate b/x64/cpu_Release/yolo_cpu.tlog/yolo_cpu.lastbuildstate
new file mode 100644
index 0000000..9aaea05
--- /dev/null
+++ b/x64/cpu_Release/yolo_cpu.tlog/yolo_cpu.lastbuildstate
@@ -0,0 +1,2 @@
+#TargetFrameworkVersion=v4.0:PlatformToolSet=v140:EnableManagedIncrementalBuild=false:VCToolArchitecture=Native32Bit:WindowsTargetPlatformVersion=8.1
+Release|x64|F:\GitHub\yolo2_light_cpu\|
diff --git a/x64/cpu_Release/yolov2_forward_network.obj b/x64/cpu_Release/yolov2_forward_network.obj
new file mode 100644
index 0000000..4fdbe88
Binary files /dev/null and b/x64/cpu_Release/yolov2_forward_network.obj differ
diff --git a/x64/cpu_Release/yolov2_forward_network_quantized.obj b/x64/cpu_Release/yolov2_forward_network_quantized.obj
new file mode 100644
index 0000000..b0fcb4a
Binary files /dev/null and b/x64/cpu_Release/yolov2_forward_network_quantized.obj differ
diff --git a/yolo_cpu.vcxproj b/yolo_cpu.vcxproj
index 70fd9f1..4801e1c 100644
--- a/yolo_cpu.vcxproj
+++ b/yolo_cpu.vcxproj
@@ -128,7 +128,7 @@
       <IntrinsicFunctions>true</IntrinsicFunctions>
       <SDLCheck>true</SDLCheck>
       <AdditionalIncludeDirectories>C:\opencv_3.0\opencv\build\include;3rdparty\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
-      <PreprocessorDefinitions>AVX;OPENCV;_TIMESPEC_DEFINED;_CRT_SECURE_NO_WARNINGS;WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>OPENCV;_TIMESPEC_DEFINED;_CRT_SECURE_NO_WARNINGS;WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <OpenMPSupport>true</OpenMPSupport>
       <MultiProcessorCompilation>true</MultiProcessorCompilation>
       <UndefinePreprocessorDefinitions>