From 4c73c3b5d7f78c40f69e0c04fd4afb9f48add1e6 Mon Sep 17 00:00:00 2001
From: tbbdev <inteltbbdevelopers@intel.com>
Date: Wed, 6 Dec 2017 20:29:49 +0300
Subject: [PATCH] Committing Intel(R) TBB 2018 Update 2 source code

---
 CHANGES                                       |   26 +
 Makefile                                      |    6 +-
 README.md                                     |    4 +-
 build/android.clang.inc                       |   14 +-
 build/build.py                                |  177 +++
 build/common_rules.inc                        |    7 +
 build/macos.clang.inc                         |    4 +-
 build/macos.gcc.inc                           |    4 +-
 build/macos.icc.inc                           |    3 +-
 build/windows.cl.inc                          |   18 +-
 doc/Release_Notes.txt                         |   10 +-
 doc/html/a00040.html                          |    6 +-
 doc/html/a00361.html                          |    2 +-
 .../msvs/win8ui/copy_libraries_and_assets.bat |    2 +-
 .../tachyon/msvs/win8ui/tbbTachyon.sln        |    4 +-
 .../tachyon/msvs/win8ui/tbbTachyon.vcxproj    |   12 +-
 .../msvs/win8ui/tbbTachyon.vcxproj.filters    |    4 +-
 include/tbb/blocked_range2d.h                 |    2 +-
 include/tbb/blocked_range3d.h                 |    2 +-
 include/tbb/concurrent_hash_map.h             |   26 +-
 include/tbb/flow_graph.h                      |   31 +-
 include/tbb/flow_graph_opencl_node.h          |    8 +-
 include/tbb/internal/_flow_graph_impl.h       |   29 +-
 include/tbb/internal/_flow_graph_trace_impl.h |    7 +-
 include/tbb/partitioner.h                     |   16 +-
 include/tbb/tbb_config.h                      |    3 +-
 include/tbb/tbb_stddef.h                      |    6 +-
 jni/Application.mk                            |   11 +-
 python/Makefile                               |   38 +-
 python/TBB.py                                 |   28 +
 python/index.html                             |   61 +-
 python/rml/Makefile                           |  155 +++
 python/rml/ipc_server.cpp                     | 1119 +++++++++++++++++
 python/rml/ipc_utils.cpp                      |  144 +++
 python/rml/ipc_utils.h                        |   34 +
 python/setup.py                               |   32 +-
 python/tbb/__init__.py                        |  322 +++++
 python/tbb/__main__.py                        |   24 +
 python/{tbb.i => tbb/api.i}                   |  107 +-
 python/{tbb.src.py => tbb/pool.py}            |  217 +---
 python/tbb/test.py                            |  199 +++
 src/Makefile                                  |    4 +-
 src/tbb/arena.cpp                             |   13 +-
 src/tbb/concurrent_vector.cpp                 |    4 +-
 src/tbb/condition_variable.cpp                |   12 +-
 src/tbb/mac32-tbb-export.lst                  |    9 +
 src/tbb/mac64-tbb-export.lst                  |    9 +
 src/tbb/market.cpp                            |    2 +-
 src/tbb/tbb_main.cpp                          |    6 +-
 src/tbb/tbb_main.h                            |    2 +
 src/tbbmalloc/proxy.cpp                       |   29 +-
 src/tbbmalloc/tbb_function_replacement.cpp    |  162 +--
 src/tbbmalloc/tbb_function_replacement.h      |    6 +-
 src/test/harness_allocator.h                  |    6 +-
 src/test/harness_defs.h                       |   12 +-
 src/test/test_blocked_range2d.cpp             |   30 +-
 src/test/test_blocked_range3d.cpp             |   42 +-
 src/test/test_concurrent_hash_map.cpp         |    1 -
 src/test/test_opencl_node.cpp                 |    8 +-
 src/test/test_partitioner.h                   |   18 +
 src/test/test_partitioner_whitebox.cpp        |    2 +
 src/test/test_partitioner_whitebox.h          |   75 ++
 src/test/test_tbb_version.cpp                 |    2 +-
 63 files changed, 2856 insertions(+), 522 deletions(-)
 create mode 100644 build/build.py
 create mode 100644 python/TBB.py
 create mode 100644 python/rml/Makefile
 create mode 100644 python/rml/ipc_server.cpp
 create mode 100644 python/rml/ipc_utils.cpp
 create mode 100644 python/rml/ipc_utils.h
 create mode 100644 python/tbb/__init__.py
 create mode 100644 python/tbb/__main__.py
 rename python/{tbb.i => tbb/api.i} (58%)
 rename python/{tbb.src.py => tbb/pool.py} (79%)
 create mode 100644 python/tbb/test.py

diff --git a/CHANGES b/CHANGES
index aa7f126fdd..5c8a07b00d 100644
--- a/CHANGES
+++ b/CHANGES
@@ -2,6 +2,32 @@
 The list of most significant changes made over time in
 Intel(R) Threading Building Blocks (Intel(R) TBB).
 
+Intel TBB 2018 Update 2
+TBB_INTERFACE_VERSION == 10002
+
+Changes (w.r.t. Intel TBB 2018 Update 1):
+
+- Added support for Android* NDK r16, macOS* 10.13, Fedora* 26.
+- Binaries for Universal Windows Driver (vc14_uwd) now link with static
+    Microsoft* runtime libraries, and are only available in commercial
+    releases.
+- Extended flow graph documentation with more code samples.
+
+Preview Features:
+
+- Added a Python* module for multi-processing computations in numeric
+    Python* libraries.
+
+Bugs fixed:
+
+- Fixed constructors of concurrent_hash_map to be exception-safe.
+- Fixed auto-initialization in the main thread to be cleaned up at
+    shutdown.
+- Fixed a crash when tbbmalloc_proxy is used together with dbghelp.
+- Fixed static_partitioner to assign tasks properly in case of nested
+    parallelism.
+
+------------------------------------------------------------------------
 Intel TBB 2018 Update 1
 TBB_INTERFACE_VERSION == 10001
 
diff --git a/Makefile b/Makefile
index 40ca412a43..4efc537a9c 100644
--- a/Makefile
+++ b/Makefile
@@ -50,13 +50,11 @@ rml: mkdir
 	$(MAKE) -C "$(work_dir)_debug"  -r -f $(tbb_root)/build/Makefile.rml cfg=debug
 	$(MAKE) -C "$(work_dir)_release"  -r -f $(tbb_root)/build/Makefile.rml cfg=release
 
-
 examples: tbb tbbmalloc
 	$(MAKE) -C examples -r -f Makefile tbb_root=.. release test
 
-python: mkdir
-	$(MAKE) -C "$(work_dir)_release"  -r -f $(tbb_root)/build/Makefile.tbb cfg=release
-	bash -c ". $(work_dir)_release$(SLASH)tbbvars.sh && $(MAKE) -rC '$(full_tbb_root)/python' CXX=$(compiler) install test-install"
+python: tbb
+	$(MAKE) -C "$(work_dir)_release" -rf $(tbb_root)/python/Makefile install
 
 .PHONY: clean clean_examples mkdir info
 
diff --git a/README.md b/README.md
index 3e863874ee..2678e2e5aa 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
-# Intel(R) Threading Building Blocks 2018 Update 1
-[![Stable release](https://img.shields.io/badge/version-2018_U1-green.svg)](https://github.com/01org/tbb/releases/tag/2018_U1)
+# Intel(R) Threading Building Blocks 2018 Update 2
+[![Stable release](https://img.shields.io/badge/version-2018_U2-green.svg)](https://github.com/01org/tbb/releases/tag/2018_U2)
 [![Apache License Version 2.0](https://img.shields.io/badge/license-Apache_2.0-green.svg)](LICENSE)
 
 Intel(R) Threading Building Blocks (Intel(R) TBB) lets you easily write parallel C++ programs that take
diff --git a/build/android.clang.inc b/build/android.clang.inc
index a935968c9c..667c21a4a4 100644
--- a/build/android.clang.inc
+++ b/build/android.clang.inc
@@ -68,8 +68,20 @@ ifeq (0, $(dynamic_load))
 endif
 
 # Paths to the NDK prebuilt tools and libraries
-CPLUS_FLAGS    += --sysroot=$(SYSROOT)
+ifneq (,$(findstring $(ndk_version),r16 r16b))
+    # Since Android* NDK r16 another sysroot and isystem paths have to be specified
+    CPLUS_FLAGS += --sysroot=$(NDK_ROOT)/sysroot -isystem $(NDK_ROOT)/sysroot/usr/include/$(TRIPLE) 
+    # Android* version flag required since r16
+    CPLUS_FLAGS += -D__ANDROID_API__=$(API_LEVEL)
+else
+    CPLUS_FLAGS += --sysroot=$(SYSROOT)
+endif
+
+# Library sysroot flag
 LIB_LINK_FLAGS += --sysroot=$(SYSROOT)
+# Flag for test executables
+LINK_FLAGS     += --sysroot=$(SYSROOT)
+
 LIBS           = -L$(CPLUS_LIB_PATH) -lc++_shared
 ifeq (,$(findstring $(ndk_version),$(foreach v, 7 8 9 10 11,r$(v) r$(v)b r$(v)c r$(v)d r$(v)e)))
     LIBS +=  -lc++abi
diff --git a/build/build.py b/build/build.py
new file mode 100644
index 0000000000..7fbf21a100
--- /dev/null
+++ b/build/build.py
@@ -0,0 +1,177 @@
+#!/usr/bin/env python
+#
+# Copyright (c) 2017 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+#
+#
+
+# Provides unified tool for preparing TBB for packaging
+
+from __future__ import print_function
+import os
+import re
+import sys
+import shutil
+import platform
+import argparse
+from glob import glob
+from collections import OrderedDict
+
+jp = os.path.join
+is_win = (platform.system() == 'Windows')
+is_lin = (platform.system() == 'Linux')
+is_mac = (platform.system() == 'Darwin')
+
+default_prefix = os.getenv('PREFIX', 'install_prefix')
+if is_win:
+    default_prefix = jp(default_prefix, 'Library') # conda-specific by default on Windows
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--tbbroot',       default='.', help='Take Intel TBB from here')
+parser.add_argument('--prefix',        default=default_prefix, help='Prefix')
+parser.add_argument('--no-rebuild',    default=False, action='store_true', help='do not rebuild')
+parser.add_argument('--install',       default=False, action='store_true', help='install all')
+parser.add_argument('--install-libs',  default=False, action='store_true', help='install libs')
+parser.add_argument('--install-devel', default=False, action='store_true', help='install devel')
+parser.add_argument('--install-docs',  default=False, action='store_true', help='install docs')
+parser.add_argument('--install-python',default=False, action='store_true', help='install python module')
+parser.add_argument('--make-tool',     default='make', help='Use different make command instead')
+parser.add_argument('--copy-tool',     default=None, help='Use this command for copying ($ tool file dest-dir)')
+parser.add_argument('--build-args',    default="", help='specify extra build args')
+parser.add_argument('--build-prefix',  default='local', help='build dir prefix')
+if is_win:
+    parser.add_argument('--msbuild',   default=False, action='store_true', help='Use msbuild')
+    parser.add_argument('--vs',          default="2012", help='select VS version for build')
+    parser.add_argument('--vs-platform', default="x64",  help='select VS platform for build')
+parser.add_argument('ignore', nargs='?', help="workaround conda-build issue #2512")
+
+args = parser.parse_args()
+
+if args.install:
+    args.install_libs  = True
+    args.install_devel = True
+    args.install_docs  = True
+    args.install_python= True
+
+def custom_cp(src, dst):
+     assert os.system(' '.join([args.copy_tool, src, dst])) == 0
+
+if args.copy_tool:
+    install_cp = custom_cp # e.g. to use install -p -D -m 755 on Linux
+else:
+    install_cp = shutil.copy
+
+bin_dir = jp(args.prefix, "bin")
+lib_dir = jp(args.prefix, "lib")
+inc_dir = jp(args.prefix, 'include')
+doc_dir = jp(args.prefix, 'share', 'doc', 'tbb')
+if is_win:
+    os.environ["OS"] = "Windows_NT" # make sure TBB will interpret it corretly
+    libext = '.dll'
+    libpref = ''
+    dll_dir = bin_dir
+else:
+    libext = '.dylib' if is_mac else '.so.2'
+    libpref = 'lib'
+    dll_dir = lib_dir
+
+tbb_names = ["tbb", "tbbmalloc", "tbbmalloc_proxy"]
+
+##############################################################
+
+os.chdir(args.tbbroot)
+if is_win and args.msbuild:
+    preview_release_dir = release_dir = jp(args.tbbroot, 'build', 'vs'+args.vs, args.vs_platform, 'Release')
+    if not args.no_rebuild or not os.path.isdir(release_dir):
+        assert os.system('msbuild /m /p:Platform=%s /p:Configuration=Release %s build/vs%s/makefile.sln'% \
+                        (args.vs_platform, args.build_args, args.vs)) == 0
+    preview_debug_dir = debug_dir = jp(args.tbbroot, 'build', 'vs'+args.vs, args.vs_platform, 'Debug')
+    if not args.no_rebuild or not os.path.isdir(debug_dir):
+        assert os.system('msbuild /m /p:Platform=%s /p:Configuration=Debug %s build/vs%s/makefile.sln'% \
+                        (args.vs_platform, args.build_args, args.vs)) == 0
+else:
+    release_dir = jp(args.tbbroot, 'build', args.build_prefix+'_release')
+    debug_dir = jp(args.tbbroot, 'build', args.build_prefix+'_debug')
+    if not args.no_rebuild or not (os.path.isdir(release_dir) and os.path.isdir(debug_dir)):
+        assert os.system('%s -j tbb_build_prefix=%s %s'% \
+                        (args.make_tool, args.build_prefix, args.build_args)) == 0
+    preview_release_dir = jp(args.tbbroot, 'build', args.build_prefix+'_preview_release')
+    preview_debug_dir = jp(args.tbbroot, 'build', args.build_prefix+'_preview_debug')
+    if not args.no_rebuild or not (os.path.isdir(preview_release_dir) and os.path.isdir(preview_debug_dir)):
+        assert os.system('%s -j tbb_build_prefix=%s_preview %s tbb_cpf=1 tbb'% \
+                        (args.make_tool, args.build_prefix, args.build_args)) == 0
+
+
+filemap = OrderedDict()
+def append_files(files, dst):
+    global filemap
+    filemap.update(dict(zip(files, [dst]*len(files))))
+
+if args.install_libs:
+    files = [jp(release_dir, libpref+f+libext) for f in tbb_names]
+    append_files(files, dll_dir)
+
+if args.install_devel:
+    dll_files = [jp(debug_dir, libpref+f+'_debug'+libext) for f in tbb_names] # adding debug libraries
+    if not is_win or not args.msbuild:
+        dll_files += [jp(preview_release_dir, libpref+"tbb_preview"+libext),
+                      jp(preview_debug_dir, libpref+"tbb_preview_debug"+libext)]
+    if is_win:
+        dll_files += sum( [glob(jp(d, 'tbb*.pdb')) for d in             # copying debug info
+                          (release_dir, debug_dir, preview_release_dir, preview_debug_dir)], [])
+    if is_lin:
+        dll_files += sum( [glob(jp(d, 'libtbb*.so')) for d in           # copying linker scripts
+                          (release_dir, debug_dir, preview_release_dir, preview_debug_dir)], [])
+        # symlinks .so -> .so.2 should not be created instead
+        # since linking with -ltbb when using links can result in
+        # incorrect dependence upon unversioned .so files
+    append_files(dll_files, dll_dir)
+    if is_win:
+        lib_files = sum([glob(jp(d,e)) for d in (release_dir, debug_dir) for e in ('*.lib', '*.def')], [])
+        append_files(lib_files, lib_dir) # copying linker libs and defs
+    for rootdir, dirnames, filenames in os.walk(jp(args.tbbroot,'include')):
+        files = [jp(rootdir, f) for f in filenames if not '.html' in f]
+        append_files(files, jp(inc_dir, rootdir.split('include')[1][1:]))
+
+if args.install_python:
+    assert os.system('%s -j tbb_build_prefix=%s %s python'% \
+                    (args.make_tool, args.build_prefix, args.build_args)) == 0
+    if is_lin:
+        append_files([jp(release_dir, 'libirml.so.1')], dll_dir)
+
+if args.install_docs:
+    files = [jp(args.tbbroot, *f) for f in (
+            ('CHANGES',),
+            ('LICENSE',),
+            ('README',),
+            ('README.md',),
+            ('doc','Release_Notes.txt'),
+            )]
+    append_files(files, doc_dir)
+
+for f in filemap.keys():
+    assert os.path.exists(f)
+    assert os.path.isfile(f)
+
+if filemap:
+    print("Copying to prefix =", args.prefix)
+for f, dest in filemap.items():
+    if not os.path.isdir(dest):
+        os.makedirs(dest)
+    print("+ %s to $prefix%s"%(f,dest.replace(args.prefix, '')))
+    install_cp(f, dest)
+
+print("done")
diff --git a/build/common_rules.inc b/build/common_rules.inc
index 3922413df3..809d5c25c8 100644
--- a/build/common_rules.inc
+++ b/build/common_rules.inc
@@ -63,7 +63,14 @@ ifeq ($(origin LIB_LINK_LIBS), undefined)
     LIB_LINK_LIBS = $(LIBDL) $(LIBS)
 endif
 
+# Define C & C++ compilers according to platform defaults or CXX & CC environment variables
+ifneq (,$(findstring environment, $(origin CXX)))
+CPLUS = $(CXX)
+endif
 CONLY ?= $(CPLUS)
+ifneq (,$(findstring environment, $(origin CC)))
+CONLY = $(CC)
+endif
 
 # The most generic rules
 #$(1) - is the target pattern
diff --git a/build/macos.clang.inc b/build/macos.clang.inc
index 3237705343..63ee41e2d9 100644
--- a/build/macos.clang.inc
+++ b/build/macos.clang.inc
@@ -44,7 +44,7 @@ else
     CPLUS_FLAGS = -g -O0 -DTBB_USE_DEBUG
 endif
 
-CPLUS_FLAGS += -DUSE_PTHREAD
+CPLUS_FLAGS += -DUSE_PTHREAD $(ITT_NOTIFY)
 
 # For Clang, we add the option to support RTM intrinsics *iff* xtest is found in <immintrin.h>
 ifneq (,$(shell grep xtest `echo "\#include<immintrin.h>" | clang -E -M - 2>&1 | grep immintrin.h` 2>/dev/null))
@@ -57,12 +57,14 @@ ifneq (,$(stdlib))
 endif
 
 ifeq (intel64,$(arch))
+    ITT_NOTIFY = -DDO_ITT_NOTIFY
     CPLUS_FLAGS += -m64 $(RTM_KEY)
     LINK_FLAGS += -m64
     LIB_LINK_FLAGS += -m64
 endif
 
 ifeq (ia32,$(arch))
+    ITT_NOTIFY = -DDO_ITT_NOTIFY
     CPLUS_FLAGS += -m32 $(RTM_KEY)
     LINK_FLAGS += -m32
     LIB_LINK_FLAGS += -m32
diff --git a/build/macos.gcc.inc b/build/macos.gcc.inc
index 65c7047cb1..3890e9fdcc 100644
--- a/build/macos.gcc.inc
+++ b/build/macos.gcc.inc
@@ -44,15 +44,17 @@ else
     CPLUS_FLAGS = -g -O0 -DTBB_USE_DEBUG
 endif
 
-CPLUS_FLAGS += -DUSE_PTHREAD
+CPLUS_FLAGS += -DUSE_PTHREAD $(ITT_NOTIFY)
 
 ifeq (intel64,$(arch))
+    ITT_NOTIFY = -DDO_ITT_NOTIFY
     CPLUS_FLAGS += -m64
     LINK_FLAGS += -m64
     LIB_LINK_FLAGS += -m64
 endif
 
 ifeq (ia32,$(arch))
+    ITT_NOTIFY = -DDO_ITT_NOTIFY
     CPLUS_FLAGS += -m32
     LINK_FLAGS += -m32
     LIB_LINK_FLAGS += -m32
diff --git a/build/macos.icc.inc b/build/macos.icc.inc
index 92a339b29c..c7dafe6f34 100644
--- a/build/macos.icc.inc
+++ b/build/macos.icc.inc
@@ -58,7 +58,8 @@ else
     CPLUS_FLAGS = -g -O0 -DTBB_USE_DEBUG
 endif
 
-CPLUS_FLAGS += -DUSE_PTHREAD
+ITT_NOTIFY = -DDO_ITT_NOTIFY
+CPLUS_FLAGS += -DUSE_PTHREAD $(ITT_NOTIFY)
 
 ifneq (,$(codecov))
     CPLUS_FLAGS += -prof-gen=srcpos
diff --git a/build/windows.cl.inc b/build/windows.cl.inc
index 3906d08676..bfc7d2648b 100644
--- a/build/windows.cl.inc
+++ b/build/windows.cl.inc
@@ -43,6 +43,11 @@ else
 endif
 EH_FLAGS = $(if $(no_exceptions),/EHs-,/EHsc /GR)
 
+# UWD binaries have to use static CRT linkage
+ifeq ($(target_app), uwd)
+    MS_CRT_KEY = /MT$(if $(findstring debug,$(cfg)),d)
+endif
+
 ifeq ($(cfg), release)
         CPLUS_FLAGS = $(MS_CRT_KEY) /O2 /Zi $(EH_FLAGS) /Zc:forScope /Zc:wchar_t /D__TBB_LIB_NAME=$(TBB.LIB)
         ASM_FLAGS =
@@ -54,17 +59,20 @@ endif
 
 ZW_KEY = /ZW:nostdlib
 
-ifneq (,$(filter win8ui,$(target_app) $(target_ui)))
-        CPLUS_FLAGS += $(ZW_KEY) /D "_UNICODE" /D "UNICODE" /D "WINAPI_FAMILY=WINAPI_FAMILY_APP"
+# These flags are general for Windows* universal applications
+ifneq (,$(target_app))
+    CPLUS_FLAGS += $(ZW_KEY) /D "_UNICODE" /D "UNICODE" /D "WINAPI_FAMILY=WINAPI_FAMILY_APP"
+endif
+
+ifeq ($(target_app), win8ui)
         _WIN32_WINNT = 0x0602
-else ifneq (,$(filter uwp,$(target_app) $(target_ui)))
-        CPLUS_FLAGS += $(ZW_KEY) /D "_UNICODE" /D "UNICODE" /D "WINAPI_FAMILY=WINAPI_FAMILY_APP"
+else ifneq (,$(filter $(target_app),uwp uwd))
         _WIN32_WINNT = 0x0A00
         LIB_LINK_FLAGS += /NODEFAULTLIB:kernel32.lib OneCore.lib
 else
         CPLUS_FLAGS += /DDO_ITT_NOTIFY
 endif
-ifneq (,$(filter store,$(target_mode) $(target_ui_mode)))
+ifeq ($(target_mode), store)
 #       it is necessary to source vcvars with 'store' argument in production
         LIB_LINK_FLAGS += /APPCONTAINER
 endif
diff --git a/doc/Release_Notes.txt b/doc/Release_Notes.txt
index 2c352ebdc7..8ed04a1b24 100644
--- a/doc/Release_Notes.txt
+++ b/doc/Release_Notes.txt
@@ -66,7 +66,7 @@ Software - Supported Operating Systems
     Systems with Linux* operating systems
         CentOS 7.1
         Debian* 8, 9
-        Fedora* 24, 25
+        Fedora* 24, 25, 26
         Intel(R) Cluster Ready
         Red Hat* Enterprise Linux* 6, 7
         SuSE* Linux* Enterprise Server 11, 12
@@ -75,9 +75,9 @@ Software - Supported Operating Systems
         Yocto 2.2, 2.3
     Systems with OS X* or macOS* operating systems
         OS X* 10.10, 10.11
-        macOS* 10.12
+        macOS* 10.12, 10.13
     Systems with Android* operating systems
-        Android* 5.x, 6.x, 7.x
+        Android* 5.x, 6.x, 7.x, 8.x
 
 Software - Supported Compilers
 
@@ -94,8 +94,8 @@ Software - Supported Compilers
         version provided with that operating system is supported
             GNU Compilers (gcc) 4.1 - 7.1
             GNU C Library (glibc) version 2.4 - 2.19
-    Xcode* 6.3 - 8.3
-    Android* NDK r10e - r15b
+    Xcode* 6.3 - 9.1
+    Android* NDK r10e - r16
 
 Software - Supported Performance Analysis Tools
 
diff --git a/doc/html/a00040.html b/doc/html/a00040.html
index 65836598ca..c92ed3a67c 100644
--- a/doc/html/a00040.html
+++ b/doc/html/a00040.html
@@ -349,10 +349,10 @@
 void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="a00040.html#a0bc3593f82b3b4f9839fc051780212ab">internal_copy</a> (const <a class="el" href="a00040.html">concurrent_hash_map</a> &amp;source)</td></tr>
 <tr class="memdesc:a0bc3593f82b3b4f9839fc051780212ab"><td class="mdescLeft">&#160;</td><td class="mdescRight">Copy "source" to *this, where *this must start out empty. <br/></td></tr>
 <tr class="separator:a0bc3593f82b3b4f9839fc051780212ab"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:a31e69cc4fd5adafb379e7c6de5660912"><td class="memTemplParams" colspan="2"><a class="anchor" id="a31e69cc4fd5adafb379e7c6de5660912"></a>
+<tr class="memitem:a76750c61ad670e130850148543f67147"><td class="memTemplParams" colspan="2"><a class="anchor" id="a76750c61ad670e130850148543f67147"></a>
 template&lt;typename I &gt; </td></tr>
-<tr class="memitem:a31e69cc4fd5adafb379e7c6de5660912"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><b>internal_copy</b> (I first, I last)</td></tr>
-<tr class="separator:a31e69cc4fd5adafb379e7c6de5660912"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:a76750c61ad670e130850148543f67147"><td class="memTemplItemLeft" align="right" valign="top">void&#160;</td><td class="memTemplItemRight" valign="bottom"><b>internal_copy</b> (I first, I last, size_type reserve_size)</td></tr>
+<tr class="separator:a76750c61ad670e130850148543f67147"><td class="memSeparator" colspan="2">&#160;</td></tr>
 <tr class="memitem:acbb5b18c097fb32f264b7b5fe8d9fdd4"><td class="memItemLeft" align="right" valign="top">const_pointer&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="a00040.html#acbb5b18c097fb32f264b7b5fe8d9fdd4">internal_fast_find</a> (const Key &amp;key) const </td></tr>
 <tr class="memdesc:acbb5b18c097fb32f264b7b5fe8d9fdd4"><td class="mdescLeft">&#160;</td><td class="mdescRight">Fast find when no concurrent erasure is used. For internal use inside TBB only!  <a href="#acbb5b18c097fb32f264b7b5fe8d9fdd4">More...</a><br/></td></tr>
 <tr class="separator:acbb5b18c097fb32f264b7b5fe8d9fdd4"><td class="memSeparator" colspan="2">&#160;</td></tr>
diff --git a/doc/html/a00361.html b/doc/html/a00361.html
index 9edce96c2f..72ba936c39 100644
--- a/doc/html/a00361.html
+++ b/doc/html/a00361.html
@@ -101,7 +101,7 @@
   <tr bgcolor="#f0f0f0" class="even"><td class="entry"><b>internal::hash_map_iterator</b> (defined in <a class="el" href="a00040.html">tbb::interface5::concurrent_hash_map&lt; Key, T, HashCompare, A &gt;</a>)</td><td class="entry"><a class="el" href="a00040.html">tbb::interface5::concurrent_hash_map&lt; Key, T, HashCompare, A &gt;</a></td><td class="entry"><span class="mlabel">friend</span></td></tr>
   <tr bgcolor="#f0f0f0"><td class="entry"><b>internal::hash_map_range</b> (defined in <a class="el" href="a00040.html">tbb::interface5::concurrent_hash_map&lt; Key, T, HashCompare, A &gt;</a>)</td><td class="entry"><a class="el" href="a00040.html">tbb::interface5::concurrent_hash_map&lt; Key, T, HashCompare, A &gt;</a></td><td class="entry"><span class="mlabel">friend</span></td></tr>
   <tr class="even"><td class="entry"><a class="el" href="a00040.html#a0bc3593f82b3b4f9839fc051780212ab">internal_copy</a>(const concurrent_hash_map &amp;source)</td><td class="entry"><a class="el" href="a00040.html">tbb::interface5::concurrent_hash_map&lt; Key, T, HashCompare, A &gt;</a></td><td class="entry"><span class="mlabel">protected</span></td></tr>
-  <tr bgcolor="#f0f0f0"><td class="entry"><b>internal_copy</b>(I first, I last) (defined in <a class="el" href="a00040.html">tbb::interface5::concurrent_hash_map&lt; Key, T, HashCompare, A &gt;</a>)</td><td class="entry"><a class="el" href="a00040.html">tbb::interface5::concurrent_hash_map&lt; Key, T, HashCompare, A &gt;</a></td><td class="entry"><span class="mlabel">protected</span></td></tr>
+  <tr bgcolor="#f0f0f0"><td class="entry"><b>internal_copy</b>(I first, I last, size_type reserve_size) (defined in <a class="el" href="a00040.html">tbb::interface5::concurrent_hash_map&lt; Key, T, HashCompare, A &gt;</a>)</td><td class="entry"><a class="el" href="a00040.html">tbb::interface5::concurrent_hash_map&lt; Key, T, HashCompare, A &gt;</a></td><td class="entry"><span class="mlabel">protected</span></td></tr>
   <tr class="even"><td class="entry"><a class="el" href="a00040.html#a8603f5288db63ec35a16844427e97e42">internal_equal_range</a>(const Key &amp;key, I end) const </td><td class="entry"><a class="el" href="a00040.html">tbb::interface5::concurrent_hash_map&lt; Key, T, HashCompare, A &gt;</a></td><td class="entry"><span class="mlabel">protected</span></td></tr>
   <tr><td class="entry"><a class="el" href="a00040.html#acbb5b18c097fb32f264b7b5fe8d9fdd4">internal_fast_find</a>(const Key &amp;key) const </td><td class="entry"><a class="el" href="a00040.html">tbb::interface5::concurrent_hash_map&lt; Key, T, HashCompare, A &gt;</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">protected</span></td></tr>
   <tr bgcolor="#f0f0f0" class="even"><td class="entry"><b>is_write_access_needed</b> (defined in <a class="el" href="a00040.html">tbb::interface5::concurrent_hash_map&lt; Key, T, HashCompare, A &gt;</a>)</td><td class="entry"><a class="el" href="a00040.html">tbb::interface5::concurrent_hash_map&lt; Key, T, HashCompare, A &gt;</a></td><td class="entry"><span class="mlabel">friend</span></td></tr>
diff --git a/examples/parallel_for/tachyon/msvs/win8ui/copy_libraries_and_assets.bat b/examples/parallel_for/tachyon/msvs/win8ui/copy_libraries_and_assets.bat
index 1322d0fd5f..9f7216803b 100644
--- a/examples/parallel_for/tachyon/msvs/win8ui/copy_libraries_and_assets.bat
+++ b/examples/parallel_for/tachyon/msvs/win8ui/copy_libraries_and_assets.bat
@@ -54,7 +54,7 @@ copy "%TBBROOT%\%interim_path%\%vc_dir%\tbbmalloc%postfix%.pdb" "%output_dir%"
 copy "%TBBROOT%\%interim_lib_path%\%vc_dir%\tbb%postfix%.lib" "%output_dir%"
 
 :: Copying DAT-file
-echo Using DAT-file %dat_file% 
+echo Using DAT-file %dat_file%
 if exist %dat_file% copy %dat_file% "%output_dir%\Assets\balls.dat"
 
 goto end
diff --git a/examples/parallel_for/tachyon/msvs/win8ui/tbbTachyon.sln b/examples/parallel_for/tachyon/msvs/win8ui/tbbTachyon.sln
index a7372953a8..ed64646af5 100644
--- a/examples/parallel_for/tachyon/msvs/win8ui/tbbTachyon.sln
+++ b/examples/parallel_for/tachyon/msvs/win8ui/tbbTachyon.sln
@@ -1,6 +1,8 @@
 ﻿
 Microsoft Visual Studio Solution File, Format Version 12.00
-# Visual Studio 2012
+# Visual Studio 2013
+VisualStudioVersion = 12.0.40629.0
+MinimumVisualStudioVersion = 10.0.40219.1
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "tbbTachyon", "tbbTachyon.vcxproj", "{E20CB432-6730-4021-A372-1C81A333518A}"
 EndProject
 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{E6DDEA39-7910-47F9-A0E3-56AD7E62ACBD}"
diff --git a/examples/parallel_for/tachyon/msvs/win8ui/tbbTachyon.vcxproj b/examples/parallel_for/tachyon/msvs/win8ui/tbbTachyon.vcxproj
index 076f67ce9f..43f1c96775 100644
--- a/examples/parallel_for/tachyon/msvs/win8ui/tbbTachyon.vcxproj
+++ b/examples/parallel_for/tachyon/msvs/win8ui/tbbTachyon.vcxproj
@@ -1,5 +1,5 @@
 ﻿<?xml version="1.0" encoding="utf-8"?>
-<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
   <ItemGroup Label="ProjectConfigurations">
     <ProjectConfiguration Include="Debug|Win32">
       <Configuration>Debug</Configuration>
@@ -30,27 +30,27 @@
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>true</UseDebugLibraries>
-    <PlatformToolset>v110</PlatformToolset>
+    <PlatformToolset>v120</PlatformToolset>
     <UseIntelTBB>true</UseIntelTBB>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>true</UseDebugLibraries>
-    <PlatformToolset>v110</PlatformToolset>
+    <PlatformToolset>v120</PlatformToolset>
     <UseIntelTBB>true</UseIntelTBB>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
     <WholeProgramOptimization>false</WholeProgramOptimization>
-    <PlatformToolset>v110</PlatformToolset>
+    <PlatformToolset>v120</PlatformToolset>
     <UseIntelTBB>true</UseIntelTBB>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
     <WholeProgramOptimization>false</WholeProgramOptimization>
-    <PlatformToolset>v110</PlatformToolset>
+    <PlatformToolset>v120</PlatformToolset>
     <UseIntelTBB>true</UseIntelTBB>
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
@@ -236,4 +236,4 @@
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
   </ImportGroup>
-</Project>
\ No newline at end of file
+</Project>
diff --git a/examples/parallel_for/tachyon/msvs/win8ui/tbbTachyon.vcxproj.filters b/examples/parallel_for/tachyon/msvs/win8ui/tbbTachyon.vcxproj.filters
index 56341ace16..54c9254155 100644
--- a/examples/parallel_for/tachyon/msvs/win8ui/tbbTachyon.vcxproj.filters
+++ b/examples/parallel_for/tachyon/msvs/win8ui/tbbTachyon.vcxproj.filters
@@ -1,5 +1,5 @@
 ﻿<?xml version="1.0" encoding="utf-8"?>
-<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+<Project ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
   <ItemGroup>
     <Page Include="Common\StandardStyles.xaml">
       <Filter>Resources\Common</Filter>
@@ -212,4 +212,4 @@
       <Filter>Frontend</Filter>
     </AppxManifest>
   </ItemGroup>
-</Project>
\ No newline at end of file
+</Project>
diff --git a/include/tbb/blocked_range2d.h b/include/tbb/blocked_range2d.h
index 1e3dfd1711..0ba40794ce 100644
--- a/include/tbb/blocked_range2d.h
+++ b/include/tbb/blocked_range2d.h
@@ -55,7 +55,7 @@ class blocked_range2d {
 
     //! True if range is empty
     bool empty() const {
-        // Yes, it is a logical OR here, not AND.
+        // Range is empty if at least one dimension is empty.
         return my_rows.empty() || my_cols.empty();
     }
 
diff --git a/include/tbb/blocked_range3d.h b/include/tbb/blocked_range3d.h
index 5f3b3c860f..1c8b2a8b62 100644
--- a/include/tbb/blocked_range3d.h
+++ b/include/tbb/blocked_range3d.h
@@ -61,7 +61,7 @@ class blocked_range3d {
 
     //! True if range is empty
     bool empty() const {
-        // Yes, it is a logical OR here, not AND.
+        // Range is empty if at least one dimension is empty.
         return my_pages.empty() || my_rows.empty() || my_cols.empty();
     }
 
diff --git a/include/tbb/concurrent_hash_map.h b/include/tbb/concurrent_hash_map.h
index 52e41de56b..f75ec0563b 100644
--- a/include/tbb/concurrent_hash_map.h
+++ b/include/tbb/concurrent_hash_map.h
@@ -770,7 +770,9 @@ class concurrent_hash_map : protected internal::hash_map_base {
     concurrent_hash_map( const concurrent_hash_map &table, const allocator_type &a = allocator_type() )
         : internal::hash_map_base(), my_allocator(a)
     {
+        call_clear_on_leave scope_guard(this);
         internal_copy(table);
+        scope_guard.dismiss();
     }
 
 #if __TBB_CPP11_RVALUE_REF_PRESENT
@@ -789,7 +791,7 @@ class concurrent_hash_map : protected internal::hash_map_base {
             this->swap(table);
         }else{
             call_clear_on_leave scope_guard(this);
-            internal_copy(std::make_move_iterator(table.begin()), std::make_move_iterator(table.end()));
+            internal_copy(std::make_move_iterator(table.begin()), std::make_move_iterator(table.end()), table.size());
             scope_guard.dismiss();
         }
     }
@@ -800,8 +802,9 @@ class concurrent_hash_map : protected internal::hash_map_base {
     concurrent_hash_map( I first, I last, const allocator_type &a = allocator_type() )
         : my_allocator(a)
     {
-        reserve( std::distance(first, last) ); // TODO: load_factor?
-        internal_copy(first, last);
+        call_clear_on_leave scope_guard(this);
+        internal_copy(first, last, std::distance(first, last));
+        scope_guard.dismiss();
     }
 
 #if __TBB_INITIALIZER_LISTS_PRESENT
@@ -809,8 +812,9 @@ class concurrent_hash_map : protected internal::hash_map_base {
     concurrent_hash_map( std::initializer_list<value_type> il, const allocator_type &a = allocator_type() )
         : my_allocator(a)
     {
-        reserve(il.size());
-        internal_copy(il.begin(), il.end());
+        call_clear_on_leave scope_guard(this);
+        internal_copy(il.begin(), il.end(), il.size());
+        scope_guard.dismiss();
     }
 
 #endif //__TBB_INITIALIZER_LISTS_PRESENT
@@ -847,8 +851,7 @@ class concurrent_hash_map : protected internal::hash_map_base {
     //! Assignment
     concurrent_hash_map& operator=( std::initializer_list<value_type> il ) {
         clear();
-        reserve(il.size());
-        internal_copy(il.begin(), il.end());
+        internal_copy(il.begin(), il.end(), il.size());
         return *this;
     }
 #endif //__TBB_INITIALIZER_LISTS_PRESENT
@@ -1073,7 +1076,7 @@ class concurrent_hash_map : protected internal::hash_map_base {
     void internal_copy( const concurrent_hash_map& source );
 
     template<typename I>
-    void internal_copy( I first, I last );
+    void internal_copy( I first, I last, size_type reserve_size );
 
     //! Fast find when no concurrent erasure is used. For internal use inside TBB only!
     /** Return pointer to item with given key, or NULL if no such item exists.
@@ -1429,9 +1432,9 @@ void concurrent_hash_map<Key,T,HashCompare,A>::clear() {
 
 template<typename Key, typename T, typename HashCompare, typename A>
 void concurrent_hash_map<Key,T,HashCompare,A>::internal_copy( const concurrent_hash_map& source ) {
-    reserve( source.my_size ); // TODO: load_factor?
     hashcode_t mask = source.my_mask;
     if( my_mask == mask ) { // optimized version
+        reserve( source.my_size ); // TODO: load_factor?
         bucket *dst = 0, *src = 0;
         bool rehash_required = false;
         for( hashcode_t k = 0; k <= mask; k++ ) {
@@ -1448,12 +1451,13 @@ void concurrent_hash_map<Key,T,HashCompare,A>::internal_copy( const concurrent_h
             }
         }
         if( rehash_required ) rehash();
-    } else internal_copy( source.begin(), source.end() );
+    } else internal_copy( source.begin(), source.end(), source.my_size );
 }
 
 template<typename Key, typename T, typename HashCompare, typename A>
 template<typename I>
-void concurrent_hash_map<Key,T,HashCompare,A>::internal_copy(I first, I last) {
+void concurrent_hash_map<Key,T,HashCompare,A>::internal_copy(I first, I last, size_type reserve_size) {
+    reserve( reserve_size ); // TODO: load_factor?
     hashcode_t m = my_mask;
     for(; first != last; ++first) {
         hashcode_t h = my_hash_compare.hash( (*first).first );
diff --git a/include/tbb/flow_graph.h b/include/tbb/flow_graph.h
index fb839ac184..2672349ef5 100644
--- a/include/tbb/flow_graph.h
+++ b/include/tbb/flow_graph.h
@@ -34,7 +34,6 @@
 #include "internal/_aggregator_impl.h"
 #include "tbb_profiling.h"
 #include "task_arena.h"
-#include "flow_graph_abstractions.h"
 
 #if __TBB_PREVIEW_ASYNC_MSG
 #include <vector>    // std::vector in internal::async_storage
@@ -761,7 +760,15 @@ inline graph::graph(task_group_context& use_this_context) :
     my_is_active = true;
 }
 
-inline  void graph::reserve_wait() {
+inline graph::~graph() {
+    wait_for_all();
+    my_root_task->set_ref_count(0);
+    tbb::task::destroy(*my_root_task);
+    if (own_context) delete my_context;
+    delete my_task_arena;
+}
+
+inline void graph::reserve_wait() {
     if (my_root_task) {
         my_root_task->increment_ref_count();
         tbb::internal::fgt_reserve_wait(this);
@@ -821,12 +828,32 @@ inline void graph::reset( reset_flags f ) {
     my_reset_task_list.clear();
 }
 
+inline graph::iterator graph::begin() { return iterator(this, true); }
+
+inline graph::iterator graph::end() { return iterator(this, false); }
+
+inline graph::const_iterator graph::begin() const { return const_iterator(this, true); }
+
+inline graph::const_iterator graph::end() const { return const_iterator(this, false); }
+
+inline graph::const_iterator graph::cbegin() const { return const_iterator(this, true); }
+
+inline graph::const_iterator graph::cend() const { return const_iterator(this, false); }
+
 #if TBB_PREVIEW_FLOW_GRAPH_TRACE
 inline void graph::set_name(const char *name) {
     tbb::internal::fgt_graph_desc(this, name);
 }
 #endif
 
+inline graph_node::graph_node(graph& g) : my_graph(g) {
+    my_graph.register_node(this);
+}
+
+inline graph_node::~graph_node() {
+    my_graph.remove_node(this);
+}
+
 #include "internal/_flow_graph_node_impl.h"
 
 //! An executable node that acts as a source, i.e. it has no predecessors
diff --git a/include/tbb/flow_graph_opencl_node.h b/include/tbb/flow_graph_opencl_node.h
index bf541a10fd..812726fb15 100644
--- a/include/tbb/flow_graph_opencl_node.h
+++ b/include/tbb/flow_graph_opencl_node.h
@@ -457,7 +457,7 @@ class opencl_async_msg : public async_msg<T> {
     operator const T&() const { return data(); }
 
 protected:
-    // Overridden in this derived class to inform that 
+    // Overridden in this derived class to inform that
     // async calculation chain is over
     void finalize() const __TBB_override {
         receive_if_memory_object(*this);
@@ -530,10 +530,10 @@ class opencl_memory {
 
     opencl_async_msg<void*, Factory> receive(const cl_event *e) {
         opencl_async_msg<void*, Factory> d;
-        if (e) { 
+        if (e) {
             d = opencl_async_msg<void*, Factory>(my_host_ptr, *e);
-        } else { 
-            d = opencl_async_msg<void*, Factory>(my_host_ptr); 
+        } else {
+            d = opencl_async_msg<void*, Factory>(my_host_ptr);
         }
 
         // Concurrent receives are prohibited so we do not worry about synchronization.
diff --git a/include/tbb/internal/_flow_graph_impl.h b/include/tbb/internal/_flow_graph_impl.h
index ffdd57be83..6cb706e842 100644
--- a/include/tbb/internal/_flow_graph_impl.h
+++ b/include/tbb/internal/_flow_graph_impl.h
@@ -206,13 +206,7 @@ class graph : tbb::internal::no_copy, public tbb::flow::graph_proxy {
 
     //! Destroys the graph.
     /** Calls wait_for_all, then destroys the root task and context. */
-    ~graph() {
-        wait_for_all();
-        my_root_task->set_ref_count(0);
-        tbb::task::destroy(*my_root_task);
-        if (own_context) delete my_context;
-        delete my_task_arena;
-    }
+    ~graph();
 
 #if TBB_PREVIEW_FLOW_GRAPH_TRACE
     void set_name(const char *name);
@@ -305,17 +299,17 @@ class graph : tbb::internal::no_copy, public tbb::flow::graph_proxy {
 
     // Graph iterator constructors
     //! start iterator
-    iterator begin() { return iterator(this, true); }
+    iterator begin();
     //! end iterator
-    iterator end() { return iterator(this, false); }
+    iterator end();
     //! start const iterator
-    const_iterator begin() const { return const_iterator(this, true); }
+    const_iterator begin() const;
     //! end const iterator
-    const_iterator end() const { return const_iterator(this, false); }
+    const_iterator end() const;
     //! start const iterator
-    const_iterator cbegin() const { return const_iterator(this, true); }
+    const_iterator cbegin() const;
     //! end const iterator
-    const_iterator cend() const { return const_iterator(this, false); }
+    const_iterator cend() const;
 
     //! return status of graph execution
     bool is_cancelled() { return cancelled; }
@@ -361,12 +355,9 @@ class graph_node : tbb::internal::no_copy {
     graph& my_graph;
     graph_node *next, *prev;
 public:
-    explicit graph_node(graph& g) : my_graph(g) {
-        my_graph.register_node(this);
-    }
-    virtual ~graph_node() {
-        my_graph.remove_node(this);
-    }
+    explicit graph_node(graph& g);
+    
+    virtual ~graph_node();
 
 #if TBB_PREVIEW_FLOW_GRAPH_TRACE
     virtual void set_name(const char *name) = 0;
diff --git a/include/tbb/internal/_flow_graph_trace_impl.h b/include/tbb/internal/_flow_graph_trace_impl.h
index 111810c3b2..328f378a64 100644
--- a/include/tbb/internal/_flow_graph_trace_impl.h
+++ b/include/tbb/internal/_flow_graph_trace_impl.h
@@ -88,7 +88,10 @@ static inline void fgt_internal_create_output_port( void *node, void *p, string_
 template<typename InputType>
 void register_input_port(void *node, tbb::flow::receiver<InputType>* port, string_index name_index) {
     // TODO: Make fgt_internal_create_input_port a function template?
-    fgt_internal_create_input_port( node, port, name_index);
+    // In C++03 dependent name lookup from the template definition context
+    // works only for function declarations with external linkage:
+    // http://www.open-std.org/JTC1/SC22/WG21/docs/cwg_defects.html#561
+    fgt_internal_create_input_port(node, static_cast<void*>(port), name_index);
 }
 
 template < typename PortsTuple, int N >
@@ -239,7 +242,7 @@ static inline void fgt_async_reserve( void *node, void *graph ) {
     itt_region_begin( ITT_DOMAIN_FLOW, node, FLOW_NODE, graph, FLOW_GRAPH, FLOW_NULL );
 }
 
-static inline void fgt_async_commit( void *node, void *graph ) {
+static inline void fgt_async_commit( void *node, void */*graph*/) {
     itt_region_end( ITT_DOMAIN_FLOW, node, FLOW_NODE );
 }
 
diff --git a/include/tbb/partitioner.h b/include/tbb/partitioner.h
index d7ebdbfed4..80006adbb4 100644
--- a/include/tbb/partitioner.h
+++ b/include/tbb/partitioner.h
@@ -50,6 +50,7 @@
 #endif // __TBB_DEFINE_MIC
 
 #include "task.h"
+#include "task_arena.h"
 #include "aligned_space.h"
 #include "atomic.h"
 #include "internal/_template_helpers.h"
@@ -349,16 +350,25 @@ struct proportional_mode : adaptive_mode<Partition> {
 #endif // warning 4127 is back
 };
 
+static size_t get_initial_partition_head() {
+    int current_index = tbb::this_task_arena::current_thread_index();
+    if (current_index == tbb::task_arena::not_initialized)
+        current_index = 0;
+    return size_t(current_index);
+}
+
 //! Provides default linear indexing of partitioner's sequence
 template <typename Partition>
 struct linear_affinity_mode : proportional_mode<Partition> {
     size_t my_head;
+    size_t my_max_affinity;
     using proportional_mode<Partition>::self;
-    linear_affinity_mode() : proportional_mode<Partition>(), my_head(0) {}
+    linear_affinity_mode() : proportional_mode<Partition>(), my_head(get_initial_partition_head()),
+                             my_max_affinity(self().my_divisor) {}
     linear_affinity_mode(linear_affinity_mode &src, split) : proportional_mode<Partition>(src, split())
-        , my_head(src.my_head + src.my_divisor) {}
+        , my_head((src.my_head + src.my_divisor) % src.my_max_affinity), my_max_affinity(src.my_max_affinity) {}
     linear_affinity_mode(linear_affinity_mode &src, const proportional_split& split_obj) : proportional_mode<Partition>(src, split_obj)
-        , my_head(src.my_head + src.my_divisor) {}
+        , my_head((src.my_head + src.my_divisor) % src.my_max_affinity), my_max_affinity(src.my_max_affinity) {}
     void set_affinity( task &t ) {
         if( self().my_divisor )
             t.set_affinity( affinity_id(my_head) + 1 );
diff --git a/include/tbb/tbb_config.h b/include/tbb/tbb_config.h
index 3ac2b3e036..d8e0e20166 100644
--- a/include/tbb/tbb_config.h
+++ b/include/tbb/tbb_config.h
@@ -522,7 +522,8 @@ There are four cases that are supported:
 #endif /* TBB_PREVIEW_FLOW_GRAPH_TRACE */
 
 #ifndef __TBB_ITT_STRUCTURE_API
-#define __TBB_ITT_STRUCTURE_API ( !__TBB_DEFINE_MIC && (__TBB_CPF_BUILD || TBB_PREVIEW_FLOW_GRAPH_TRACE || TBB_PREVIEW_ALGORITHM_TRACE) )
+#define __TBB_ITT_STRUCTURE_API ( (__TBB_CPF_BUILD || TBB_PREVIEW_FLOW_GRAPH_TRACE || TBB_PREVIEW_ALGORITHM_TRACE) \
+                                  && !(__TBB_DEFINE_MIC || __MINGW64__ || __MINGW32__) )
 #endif
 
 #if TBB_USE_EXCEPTIONS && !__TBB_TASK_GROUP_CONTEXT
diff --git a/include/tbb/tbb_stddef.h b/include/tbb/tbb_stddef.h
index b4ee781010..9f7d51b536 100644
--- a/include/tbb/tbb_stddef.h
+++ b/include/tbb/tbb_stddef.h
@@ -26,7 +26,7 @@
 #define TBB_VERSION_MINOR 0
 
 // Engineering-focused interface version
-#define TBB_INTERFACE_VERSION 10001
+#define TBB_INTERFACE_VERSION 10002
 #define TBB_INTERFACE_VERSION_MAJOR TBB_INTERFACE_VERSION/1000
 
 // The oldest major interface version still supported
@@ -155,8 +155,8 @@ namespace tbb {
 
 #if TBB_USE_ASSERT
 
-    //! Assert that x is true.
-    /** If x is false, print assertion failure message.
+    //! Assert that predicate is true.
+    /** If predicate is false, print assertion failure message.
         If the comment argument is not NULL, it is printed as part of the failure message.
         The comment argument has no other effect. */
     #define __TBB_ASSERT(predicate,message) __TBB_ASSERT_RELEASE(predicate,message)
diff --git a/jni/Application.mk b/jni/Application.mk
index b9474c2236..940567b153 100644
--- a/jni/Application.mk
+++ b/jni/Application.mk
@@ -45,20 +45,23 @@ export target?=android
 
 ifeq (ia32,$(arch))
     APP_ABI:=x86
+    export TRIPLE:=i686-linux-android
 else ifeq (intel64,$(arch))
     APP_ABI:=x86_64
+    export TRIPLE:=x86_64-linux-android
 else ifeq (arm,$(arch))
     APP_ABI:=armeabi-v7a
+    export TRIPLE:=arm-linux-androideabi
 else ifeq (arm64,$(arch))
     APP_ABI:=arm64-v8a
+    export TRIPLE:=aarch64-linux-android
 else
     APP_ABI:=$(arch)
 endif
 
-APP_PLATFORM:=android-21
-ifneq ("","$(api_version)")
-    APP_PLATFORM:=$(api_version)
-endif
+api_version?=21
+export API_LEVEL:=$(api_version)
+APP_PLATFORM:=android-$(api_version)
 
 ifeq (clang,$(compiler))
     NDK_TOOLCHAIN_VERSION:=clang
diff --git a/python/Makefile b/python/Makefile
index a444542c62..bf58ddfcae 100644
--- a/python/Makefile
+++ b/python/Makefile
@@ -16,23 +16,33 @@
 #
 #
 
-all: release test
+tbb_root?=..
+include $(tbb_root)/build/common.inc
+.PHONY: all release test install test-install
 
-clean:
-	python setup.py clean
-	-rm -rf build/ tbb_wrap.* _TBB.* *.pyc TBB.py*
+export TBBROOT=$(abspath $(tbb_root))
+SRC=$(tbb_root)/python/*.py $(tbb_root)/python/tbb/*
+PY_SETUP=python $(tbb_root)/python/setup.py
+
+all: install test
 
-release: TBB.py
+clean:
+	$(PY_SETUP) clean -b$(CURDIR)
 
-TBB.py: tbb.i tbb.src.py setup.py
-	python setup.py build_ext -f --inplace
+release: CC=$(compiler)
+release: $(SRC) rml
+	$(PY_SETUP) build -b$(CURDIR) -f check
 
-test: TBB.py
-	python TBB.py test
+install: CC=$(compiler)
+install: $(SRC) rml
+	$(PY_SETUP) build -b$(CURDIR) install
 
-install:
-	python setup.py install
+test:
+	python -m tbb test
 
-test-install:
-	@echo Testing installed module
-	python -m TBB test
+rml:
+ifeq (linux,$(tbb_os))
+	$(MAKE) -C "$(work_dir)_release" -rf $(tbb_root)/python/rml/Makefile cfg=release rml
+rml_%:
+	$(MAKE) -C "$(work_dir)_release" -rf $(tbb_root)/python/rml/Makefile cfg=release $(subst rml_,,$@)
+endif
diff --git a/python/TBB.py b/python/TBB.py
new file mode 100644
index 0000000000..9e162997d1
--- /dev/null
+++ b/python/TBB.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python
+#
+# Copyright (c) 2016-2017 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+#
+#
+
+
+from tbb import *
+from tbb import __all__, __doc__
+
+if __name__ == "__main__":
+    from tbb import _main
+    import sys
+    sys.exit(_main())
diff --git a/python/index.html b/python/index.html
index 480f870b86..a13b6f2298 100644
--- a/python/index.html
+++ b/python/index.html
@@ -4,10 +4,10 @@ <H2>Python* API for Intel&reg; Threading Building Blocks (Intel&reg; TBB).
 </H2>
 
 <H2>Overview</H2>
-It is a preview Python* module which unlocks opportunities for additional performance in multi-threaded Python programs by enabling threading composability
+It is a preview Python* module which unlocks opportunities for additional performance in multi-threaded and multiprocess Python programs by enabling threading composability
 between two or more thread-enabled libraries like Numpy, Scipy, Sklearn, Dask, Joblib, and etc.
 <p></p>
-The biggest improvement can be achieved when a task pool like the ThreadPool from the Python standard library or libraries like Dask or Joblib (used in multi-threading mode)
+The biggest improvement can be achieved when a task pool like the ThreadPool or Pool from the Python standard library or libraries like Dask or Joblib (used either in multi-threading or multi-processing mode)
 execute tasks calling compute-intensive functions of Numpy/Scipy/Sklearn/PyDAAL which in turn are parallelized using Intel&reg; Math Kernel Library or/and Intel&reg; TBB.
 <p></p>
 The module implements Pool class with the standard interface using Intel&reg; TBB which can be used to replace Python's ThreadPool.
@@ -15,43 +15,52 @@ <H2>Overview</H2>
 <p></p>
 For more information and examples, please refer to <A HREF="http://software.intel.com/en-us/blogs/2016/04/04/unleash-parallel-performance-of-python-programs">online blog</A>.
 
+<H2>Directories</H2>
+<DL>
+<DT><A HREF="rml">rml</A>
+<DD>The folder contains sources for building the plugin with cross-process dynamic thread scheduler implementation.
+<DT><A HREF="tbb">tbb</A>
+<DD>The folder contains Python module sources.
+</DL>
+
 <H2>Files</H2>
 <DL>
 <DT><A HREF="setup.py">setup.py</A>
 <DD>Standard Python setup script.
 <DT><A HREF="Makefile">Makefile</A>
-<DD>Makefile for building, installing, and testing. See below.
-<DT><A HREF="tbb.i">tbb.i</A>
-<DD>SWIG interface description file.
-<DT><A HREF="tbb.src.py">tbb.src.py</A>
-<DD>Python part of module implementation.
+<DD>Internal Makefile for building, installing, and testing. See below.
+<DT><A HREF="TBB.py">TBB.py</A>
+<DD>Alternative entry point for Python module.
 </DL>
 
 <A NAME=build><H2>Build and install</H2></A>
-Prior to building it, please set up the environment using corresponding tbbvars script, e.g. `source tbbvars.sh intel64`
+For accessing targets defined in python/Makefile, please use
+<A HREF="../src/index.html">src/Makefile</A>
+instead and build runtime libraries before working with Python.
 <DL>
-<DT><TT>make</TT>
-<DD>Default build and run. Equivalent to 'make release test'.
-<DT><TT>make release</TT>
-<DD>Compile and link against the release version of Intel TBB runtime library. The resulting executable is left in the directory for the example.
-<DT><TT>make test</TT>
-<DD>Run local build of the module previously produced by one of the above commands.
-<DT><TT>make install</TT>
-<DD>Install module into Python.
-<DT><TT>make <B>[</B>(above options or targets)<B>]</B> CXX=<B>{</B>icl, icc<B>}</B></TT>
-<DD>Build and run as above, but use Intel&reg; C++ compiler instead of default, native compilers
-(e.g., icl instead of cl.exe on Windows* systems, or icc instead of g++ on Linux* or macOS* systems).
-Please note, CXX=icl works on Windows only with Intel&reg; Distribution for Python*.
-<DT><TT>make clean</TT>
-<DD>Remove any intermediate files produced by the above commands.
+<DT><TT>make -C ../src python_all</TT>
+<DD>Install and test as described below.
+<DT><TT>make -C ../src python_install</TT>
+<DD>Install module into Python environment.
+<DT><TT>make -C ../src python_test</TT>
+<DD>Test installed Intel&reg; TBB module for Python.
+<DT><TT>make -C ../src python_release</TT>
+<DD>Recompile Python module. Result is located in Intel&reg; TBB build directory.
+<DT><TT>make  -C ../src python_clean</TT>
+<DD>Remove any intermediate files produced by the commands above. Does not remove installed module.
 </DL>
 
 <H2>Command-line interface</H2>
 <DL>
-<DT><TT>pydoc TBB</TT>
-<DD>Read built-in documentation for Python interfaces.
-<DT><TT>python -m TBB your_script.py</TT>
-<DD>Run your_script.py in context of `with TBB.Monkey():` when Intel TBB is enabled.
+<DT><TT>python -m tbb -h</TT>
+<DD>Print documentation on command-line interface</DD>
+<DT><TT>pydoc tbb</TT>
+<DD>Read built-in documentation for Python interfaces.</DD>
+<DT><TT>python-tbb your_script.py</TT>
+<DT><TT>python -m tbb your_script.py</TT>
+<DD>Run your_script.py in context of `with tbb.Monkey():` when Intel&reg; TBB is enabled. By default only multi-threading will be covered.</DD>
+<DT><TT>python -m tbb --ipc your_script.py</TT>
+<DD>Run your_script.py in context of `with tbb.Monkey():` when Intel&reg; TBB enabled in both multi-threading and multi-processing modes.</DD>
 </DL>
 
 <H2>System Requirements</H2>
diff --git a/python/rml/Makefile b/python/rml/Makefile
new file mode 100644
index 0000000000..6ca2d21942
--- /dev/null
+++ b/python/rml/Makefile
@@ -0,0 +1,155 @@
+# Copyright (c) 2017 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+#
+#
+
+
+.NOTPARALLEL:
+
+tbb_root ?= ../..
+BUILDING_PHASE=0
+TEST_RESOURCE = $(RML.RES)
+include $(tbb_root)/build/common.inc
+DEBUG_SUFFIX=$(findstring _debug,_$(cfg))
+
+ifneq (linux,$(target))
+$(error "IPC RML is supported on Linux only")
+endif
+
+.PHONY: default rml test clean
+
+# default target
+default: rml test
+
+RML_ROOT ?= $(tbb_root)/src/rml
+RML_SERVER_ROOT = $(RML_ROOT)/server
+# TODO: new API needs to be added for this server, exposing everything
+RML.DEF =
+
+VPATH = $(tbb_root)/src/tbb $(tbb_root)/src/tbb/$(ASSEMBLY_SOURCE)
+VPATH += $(tbb_root)/python/rml $(RML_ROOT)/test $(tbb_root)/src/test
+VPATH += $(tbb_root)/src/rml/client
+
+include $(tbb_root)/build/common_rules.inc
+
+#--------------------------------------------------------------------------
+# Define rules for making the RML server shared library and client objects.
+#--------------------------------------------------------------------------
+
+# Object files that make up RML server
+RML_SERVER.OBJ = ipc_server.$(OBJ)
+
+# Object files that RML clients need
+RML_TBB_CLIENT.OBJ ?= ipc_utils.$(OBJ)
+RML.OBJ = $(RML_SERVER.OBJ) $(RML_TBB_CLIENT.OBJ)
+ifeq (windows,$(tbb_os))
+RML_ASM.OBJ = $(if $(findstring intel64,$(arch)),$(TBB_ASM.OBJ))
+endif
+ifeq (linux,$(tbb_os))
+RML_ASM.OBJ = $(if $(findstring ia64,$(arch)),$(TBB_ASM.OBJ))
+endif
+
+RML_TBB_DEP= cache_aligned_allocator_rml.$(OBJ) dynamic_link_rml.$(OBJ) tbb_misc_rml.$(OBJ) tbb_misc_ex_rml.$(OBJ)
+TBB_DEP_NON_RML_TEST?= cache_aligned_allocator_rml.$(OBJ) dynamic_link_rml.$(OBJ) $(RML_ASM.OBJ) tbb_misc_rml.$(OBJ) tbb_misc_ex_rml.$(OBJ)
+ifeq ($(cfg),debug)
+RML_TBB_DEP+= spin_mutex_rml.$(OBJ)
+TBB_DEP_RML_TEST?= $(RML_ASM.OBJ) tbb_misc_rml.$(OBJ)
+else
+TBB_DEP_RML_TEST?= $(RML_ASM.OBJ)
+endif
+LIBS += $(LIBDL)
+TBB_DEP_RML_TEST =  rml_tbb.$(OBJ) dynamic_link_rml.$(OBJ)
+
+INCLUDES += $(INCLUDE_KEY)$(RML_ROOT)/include $(INCLUDE_KEY).
+T_INCLUDES = $(INCLUDES) $(INCLUDE_KEY)$(tbb_root)/src/test $(INCLUDE_KEY)$(RML_SERVER_ROOT)
+
+# Suppress superfluous warnings for RML compilation
+R_CPLUS_FLAGS =  $(subst DO_ITT_NOTIFY,DO_ITT_NOTIFY=0,$(CPLUS_FLAGS)) $(WARNING_SUPPRESS) \
+		 $(DEFINE_KEY)TBB_USE_THREADING_TOOLS=0 $(DEFINE_KEY)__TBB_RML_STATIC=1 $(DEFINE_KEY)__TBB_NO_IMPLICIT_LINKAGE=1
+
+%.$(OBJ): %.cpp
+	$(CPLUS) $(COMPILE_ONLY) $(R_CPLUS_FLAGS) $(PIC_KEY) $(DSE_KEY) $(INCLUDES) $<
+
+tbb_misc_rml.$(OBJ) $(RML_SERVER.OBJ): version_string.ver
+
+RML_TEST.OBJ = test_job_automaton.$(OBJ) test_thread_monitor.$(OBJ) test_rml_tbb.$(OBJ)
+
+$(RML_TBB_DEP): %_rml.$(OBJ): %.cpp
+	$(CPLUS) $(COMPILE_ONLY) $(OUTPUTOBJ_KEY)$@ $(R_CPLUS_FLAGS) $(PIC_KEY) $(DSE_KEY) $(INCLUDES) $<
+
+$(RML_TEST.OBJ): %.$(OBJ): %.cpp
+	$(CPLUS) $(COMPILE_ONLY) $(R_CPLUS_FLAGS) $(PIC_KEY) $(T_INCLUDES) $<
+
+ifneq (,$(RML.DEF))
+rml.def: $(RML.DEF)
+	$(CPLUS) $(PREPROC_ONLY) $< $(CPLUS_FLAGS) $(INCLUDES) > $@
+
+LIB_LINK_FLAGS += $(EXPORT_KEY)rml.def
+$(RML.DLL): rml.def
+endif
+
+$(RML.DLL): CPLUS_FLAGS += $(SDL_FLAGS)
+$(RML.DLL): BUILDING_LIBRARY = $(RML.DLL)
+$(RML.DLL): $(RML_TBB_DEP) $(RML.OBJ) $(RML.RES) $(RML_NO_VERSION.DLL) $(RML_ASM.OBJ)
+	$(LIB_LINK_CMD) $(LIB_OUTPUT_KEY)$(RML.DLL) $(RML.OBJ) $(RML_TBB_DEP) $(RML_ASM.OBJ) $(RML.RES) $(LIB_LINK_LIBS) $(LIB_LINK_FLAGS)
+
+ifneq (,$(RML_NO_VERSION.DLL))
+$(RML_NO_VERSION.DLL):
+	echo "INPUT ($(RML.DLL))" > $(RML_NO_VERSION.DLL)
+endif
+
+rml: rml_dll
+rml_dll: $(RML.DLL)
+
+#------------------------------------------------------
+# End of rules for making the RML server shared library
+#------------------------------------------------------
+
+#------------------------------------------------------
+# Define rules for making the RML unit tests
+#------------------------------------------------------
+
+add_debug=$(basename $(1))_debug$(suffix $(1))
+cross_suffix=$(if $(crosstest),$(if $(DEBUG_SUFFIX),$(subst _debug,,$(1)),$(call add_debug,$(1))),$(1))
+
+RML_TESTS = test_job_automaton.$(TEST_EXT) test_thread_monitor.$(TEST_EXT)
+RML_CUSTOM_TESTS = test_rml_tbb.$(TEST_EXT)
+
+test_rml_tbb.$(TEST_EXT): test_rml_tbb.$(OBJ) $(RML_TBB_CLIENT.OBJ) $(TBB_DEP_RML_TEST)
+	$(CPLUS) $(OUTPUT_KEY)$@ $(CPLUS_FLAGS) test_rml_tbb.$(OBJ) $(RML_TBB_CLIENT.OBJ) $(TBB_DEP_RML_TEST) $(LIBS) $(LINK_FLAGS)
+
+$(RML_TESTS): %.$(TEST_EXT): %.$(OBJ) $(TBB_DEP_NON_RML_TEST)
+	$(CPLUS) $(OUTPUT_KEY)$@ $(CPLUS_FLAGS) $< $(TBB_DEP_NON_RML_TEST) $(LIBS) $(LINK_FLAGS)
+
+export IPC_ENABLE=1
+### run_cmd is usually empty
+test: $(call cross_suffix,$(RML.DLL)) $(TEST_PREREQUISITE) $(RML_TESTS) $(RML_CUSTOM_TESTS)
+	$(run_cmd) ./test_job_automaton.$(TEST_EXT) $(args)
+	$(run_cmd) ./test_thread_monitor.$(TEST_EXT) $(args)
+#TODO:	$(run_cmd) ./test_rml_tbb.$(TEST_EXT) $(args)
+#TODO:	IPC_ENABLE=1 LD_PRELOAD=$(abspath libirml.so.1) $(MAKE) -rf $(tbb_root)/src/Makefile cfg=release tbb_test_release
+
+#------------------------------------------------------
+# End of rules for making the TBBMalloc unit tests
+#------------------------------------------------------
+
+# Include automatically generated dependencies
+-include *.d
+
+clean:
+	-rm -rf *.o *.so* *.d *.def version_string.ver
+	-rm -rf $(work_dir)_release/libirml*
+	-rm -rf $(work_dir)_debug/libirml*
diff --git a/python/rml/ipc_server.cpp b/python/rml/ipc_server.cpp
new file mode 100644
index 0000000000..0c28920306
--- /dev/null
+++ b/python/rml/ipc_server.cpp
@@ -0,0 +1,1119 @@
+/*
+    Copyright (c) 2017 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+
+
+
+*/
+
+#include "rml_tbb.h"
+#include "../server/thread_monitor.h"
+#include "tbb/atomic.h"
+#include "tbb/cache_aligned_allocator.h"
+#include "tbb/scheduler_common.h"
+#include "tbb/governor.h"
+#include "tbb/tbb_misc.h"
+
+#include "ipc_utils.h"
+
+#include <fcntl.h>
+
+namespace rml {
+namespace internal {
+
+static const char* IPC_ENABLE_VAR_NAME = "IPC_ENABLE";
+
+typedef versioned_object::version_type version_type;
+
+extern "C" factory::status_type __RML_open_factory(factory& f, version_type& server_version, version_type client_version) {
+    if( !tbb::internal::rml::get_enable_flag( IPC_ENABLE_VAR_NAME ) ) {
+        return factory::st_incompatible;
+    }
+
+    // Hack to keep this library from being closed
+    static tbb::atomic<bool> one_time_flag;
+    if( one_time_flag.compare_and_swap(true,false)==false ) {
+        __TBB_ASSERT( (size_t)f.library_handle!=factory::c_dont_unload, NULL );
+#if _WIN32||_WIN64
+        f.library_handle = reinterpret_cast<HMODULE>(factory::c_dont_unload);
+#else
+        f.library_handle = reinterpret_cast<void*>(factory::c_dont_unload);
+#endif
+    }
+    // End of hack
+
+    return factory::st_success;
+}
+
+extern "C" void __RML_close_factory(factory& f) {
+}
+
+class ipc_thread_monitor : public thread_monitor {
+public:
+    ipc_thread_monitor() : thread_monitor() {}
+
+#if USE_WINTHREAD
+#elif USE_PTHREAD
+    static handle_type launch(thread_routine_type thread_routine, void* arg, size_t stack_size);
+#endif
+};
+
+#if USE_WINTHREAD
+#elif USE_PTHREAD
+inline ipc_thread_monitor::handle_type ipc_thread_monitor::launch(void* (*thread_routine)(void*), void* arg, size_t stack_size) {
+    pthread_attr_t s;
+    if( pthread_attr_init( &s ) ) return 0;
+    if( stack_size>0 ) {
+        if( pthread_attr_setstacksize( &s, stack_size ) ) return 0;
+    }
+    pthread_t handle;
+    if( pthread_create( &handle, &s, thread_routine, arg ) ) return 0;
+    if( pthread_attr_destroy( &s ) ) return 0;
+    return handle;
+}
+#endif
+
+}} //rml::internal
+
+using rml::internal::ipc_thread_monitor;
+
+namespace tbb {
+namespace internal {
+namespace rml {
+
+typedef ipc_thread_monitor::handle_type thread_handle;
+
+class ipc_server;
+
+static const char* IPC_MAX_THREADS_VAR_NAME = "MAX_THREADS";
+static const char* IPC_ACTIVE_SEM_PREFIX = "/__IPC_active";
+static const char* IPC_STOP_SEM_PREFIX = "/__IPC_stop";
+static const char* IPC_ACTIVE_SEM_VAR_NAME = "IPC_ACTIVE_SEMAPHORE";
+static const char* IPC_STOP_SEM_VAR_NAME = "IPC_STOP_SEMAPHORE";
+static const mode_t IPC_SEM_MODE = 0660;
+
+static tbb::atomic<unsigned> my_global_thread_count;
+
+char* get_active_sem_name() {
+    char* value = getenv( IPC_ACTIVE_SEM_VAR_NAME );
+    if( value!=NULL && strlen( value )>0 ) {
+        char* sem_name = new char[strlen( value ) + 1];
+        __TBB_ASSERT( sem_name!=NULL, NULL );
+        strcpy( sem_name, value );
+        return sem_name;
+    } else {
+        return get_shared_name( IPC_ACTIVE_SEM_PREFIX );
+    }
+}
+
+char* get_stop_sem_name() {
+    char* value = getenv( IPC_STOP_SEM_VAR_NAME );
+    if( value!=NULL && strlen( value )>0 ) {
+        char* sem_name = new char[strlen( value ) + 1];
+        __TBB_ASSERT( sem_name!=NULL, NULL );
+        strcpy( sem_name, value );
+        return sem_name;
+    } else {
+        return get_shared_name( IPC_STOP_SEM_PREFIX );
+    }
+}
+
+static void release_thread_sem(sem_t* my_sem) {
+    int old;
+    do {
+        old = my_global_thread_count;
+        if( old<=0 ) return;
+    } while( my_global_thread_count.compare_and_swap(old-1, old)!=old );
+    if( old>0 ) {
+        sem_post( my_sem );
+    }
+}
+
+extern "C" void set_active_sem_name() {
+    char* templ = new char[strlen( IPC_ACTIVE_SEM_PREFIX ) + strlen( "_XXXXXX" ) + 1];
+    __TBB_ASSERT( templ!=NULL, NULL );
+    strcpy( templ, IPC_ACTIVE_SEM_PREFIX );
+    strcpy( templ + strlen( IPC_ACTIVE_SEM_PREFIX ), "_XXXXXX" );
+    char* sem_name = mktemp( templ );
+    if( sem_name!=NULL ) {
+        int status = setenv( IPC_ACTIVE_SEM_VAR_NAME, sem_name, 1 );
+        __TBB_ASSERT( status==0, NULL );
+    }
+    delete[] templ;
+}
+
+extern "C" void set_stop_sem_name() {
+    char* templ = new char[strlen( IPC_STOP_SEM_PREFIX ) + strlen( "_XXXXXX" ) + 1];
+    __TBB_ASSERT( templ!=NULL, NULL );
+    strcpy( templ, IPC_STOP_SEM_PREFIX );
+    strcpy( templ + strlen( IPC_STOP_SEM_PREFIX ), "_XXXXXX" );
+    char* sem_name = mktemp( templ );
+    if( sem_name!=NULL ) {
+        int status = setenv( IPC_STOP_SEM_VAR_NAME, sem_name, 1 );
+        __TBB_ASSERT( status==0, NULL );
+    }
+    delete[] templ;
+}
+
+extern "C" void release_resources() {
+    if( my_global_thread_count!=0 ) {
+        char* active_sem_name = get_active_sem_name();
+        sem_t* my_active_sem = sem_open( active_sem_name, O_CREAT );
+        __TBB_ASSERT( my_active_sem, "Unable to open active threads semaphore" );
+        delete[] active_sem_name;
+
+        do {
+            release_thread_sem( my_active_sem );
+        } while( my_global_thread_count!=0 );
+    }
+}
+
+extern "C" void release_semaphores() {
+    int status = 0;
+    char* sem_name = NULL;
+
+    sem_name = get_active_sem_name();
+    if( sem_name==NULL ) {
+        runtime_warning("Can not get RML semaphore name");
+        return;
+    }
+    status = sem_unlink( sem_name );
+    if( status!=0 ) {
+        if( errno==ENOENT ) {
+            /* There is no semaphore with the given name, nothing to do */
+        } else {
+            runtime_warning("Can not release RML semaphore");
+            return;
+        }
+    }
+    delete[] sem_name;
+
+    sem_name = get_stop_sem_name();
+    if( sem_name==NULL ) {
+        runtime_warning( "Can not get RML semaphore name" );
+        return;
+    }
+    status = sem_unlink( sem_name );
+    if( status!=0 ) {
+        if( errno==ENOENT ) {
+            /* There is no semaphore with the given name, nothing to do */
+        } else {
+            runtime_warning("Can not release RML semaphore");
+            return;
+        }
+    }
+    delete[] sem_name;
+}
+
+class ipc_worker: no_copy {
+protected:
+    //! State in finite-state machine that controls the worker.
+    /** State diagram:
+                    /----------stop---\
+                    |           ^     |
+                    V           |     |
+        init --> starting --> normal  |
+          |         |           |     |
+          |         V           |     |
+          \------> quit <-------/<----/
+      */
+    enum state_t {
+        //! *this is initialized
+        st_init,
+        //! *this has associated thread that is starting up.
+        st_starting,
+        //! Associated thread is doing normal life sequence.
+        st_normal,
+        //! Associated thread is stopped but can be started again.
+        st_stop,
+        //! Associated thread has ended normal life sequence and promises to never touch *this again.
+        st_quit
+    };
+    atomic<state_t> my_state;
+
+    //! Associated server
+    ipc_server& my_server;
+
+    //! Associated client
+    tbb_client& my_client;
+
+    //! index used for avoiding the 64K aliasing problem
+    const size_t my_index;
+
+    //! Monitor for sleeping when there is no work to do.
+    /** The invariant that holds for sleeping workers is:
+        "my_slack<=0 && my_state==st_normal && I am on server's list of asleep threads" */
+    ipc_thread_monitor my_thread_monitor;
+
+    //! Handle of the OS thread associated with this worker
+    thread_handle my_handle;
+
+    //! Link for list of workers that are sleeping or have no associated thread.
+    ipc_worker* my_next;
+
+    friend class ipc_server;
+
+    //! Actions executed by the associated thread
+    void run();
+
+    //! Wake up associated thread (or launch a thread if there is none)
+    bool wake_or_launch();
+
+    //! Called by a thread (usually not the associated thread) to commence termination.
+    void start_shutdown(bool join);
+
+    //! Called by a thread (usually not the associated thread) to commence stopping.
+    void start_stopping(bool join);
+
+    static __RML_DECL_THREAD_ROUTINE thread_routine(void* arg);
+
+    static void release_handle(thread_handle my_handle, bool join);
+
+protected:
+    ipc_worker(ipc_server& server, tbb_client& client, const size_t i) :
+        my_server(server),
+        my_client(client),
+        my_index(i)
+    {
+        my_state = st_init;
+    }
+};
+
+static const size_t cache_line_size = tbb::internal::NFS_MaxLineSize;
+
+#if _MSC_VER && !defined(__INTEL_COMPILER)
+    // Suppress overzealous compiler warnings about uninstantiable class
+    #pragma warning(push)
+    #pragma warning(disable:4510 4610)
+#endif
+class padded_ipc_worker: public ipc_worker {
+    char pad[cache_line_size - sizeof(ipc_worker)%cache_line_size];
+public:
+    padded_ipc_worker(ipc_server& server, tbb_client& client, const size_t i)
+    : ipc_worker( server,client,i ) { suppress_unused_warning(pad); }
+};
+#if _MSC_VER && !defined(__INTEL_COMPILER)
+    #pragma warning(pop)
+#endif
+
+class ipc_waker : public padded_ipc_worker {
+private:
+    static __RML_DECL_THREAD_ROUTINE thread_routine(void* arg);
+    void run();
+    bool wake_or_launch();
+
+    friend class ipc_server;
+
+public:
+    ipc_waker(ipc_server& server, tbb_client& client, const size_t i)
+    : padded_ipc_worker( server, client, i ) {}
+};
+
+class ipc_stopper : public padded_ipc_worker {
+private:
+    static __RML_DECL_THREAD_ROUTINE thread_routine(void* arg);
+    void run();
+    bool wake_or_launch();
+
+    friend class ipc_server;
+
+public:
+    ipc_stopper(ipc_server& server, tbb_client& client, const size_t i)
+    : padded_ipc_worker( server, client, i ) {}
+};
+
+class ipc_server: public tbb_server, no_copy {
+private:
+    tbb_client& my_client;
+    //! Maximum number of threads to be created.
+    /** Threads are created lazily, so maximum might not actually be reached. */
+    tbb_client::size_type my_n_thread;
+
+    //! Stack size for each thread. */
+    const size_t my_stack_size;
+
+    //! Number of jobs that could use their associated thread minus number of active threads.
+    /** If negative, indicates oversubscription.
+        If positive, indicates that more threads should run.
+        Can be lowered asynchronously, but must be raised only while holding my_asleep_list_mutex,
+        because raising it impacts the invariant for sleeping threads. */
+    atomic<int> my_slack;
+
+    //! Counter used to determine when to delete this.
+    atomic<int> my_ref_count;
+
+    padded_ipc_worker* my_thread_array;
+
+    //! List of workers that are asleep or committed to sleeping until notified by another thread.
+    tbb::atomic<ipc_worker*> my_asleep_list_root;
+
+    //! Protects my_asleep_list_root
+    typedef scheduler_mutex_type asleep_list_mutex_type;
+    asleep_list_mutex_type my_asleep_list_mutex;
+
+    //! Should server wait workers while terminate
+    const bool my_join_workers;
+
+    //! Service thread for waking of workers
+    ipc_waker* my_waker;
+
+    //! Service thread to stop threads
+    ipc_stopper* my_stopper;
+
+    //! Semaphore to account active threads
+    sem_t* my_active_sem;
+
+    //! Semaphore to account stop threads
+    sem_t* my_stop_sem;
+
+#if TBB_USE_ASSERT
+    atomic<int> my_net_slack_requests;
+#endif /* TBB_USE_ASSERT */
+
+    //! Wake up to two sleeping workers, if there are any sleeping.
+    /** The call is used to propagate a chain reaction where each thread wakes up two threads,
+        which in turn each wake up two threads, etc. */
+    void propagate_chain_reaction() {
+        // First test of a double-check idiom.  Second test is inside wake_some(0).
+        if( my_slack>0 ) {
+            int active_threads = 0;
+            if( try_get_active_thread() ) {
+                ++active_threads;
+                if( try_get_active_thread() ) {
+                    ++active_threads;
+                }
+                wake_some( 0, active_threads );
+            }
+        }
+    }
+
+    //! Try to add t to list of sleeping workers
+    bool try_insert_in_asleep_list(ipc_worker& t);
+
+    //! Try to add t to list of sleeping workers even if there is some work to do
+    bool try_insert_in_asleep_list_forced(ipc_worker& t);
+
+    //! Equivalent of adding additional_slack to my_slack and waking up to 2 threads if my_slack permits.
+    void wake_some(int additional_slack, int active_threads);
+
+    //! Equivalent of adding additional_slack to my_slack and waking up to 1 thread if my_slack permits.
+    void wake_one_forced(int additional_slack);
+
+    //! Stop one thread from asleep list
+    bool stop_one();
+
+    //! Wait for active thread
+    bool wait_active_thread();
+
+    //! Try to get active thread
+    bool try_get_active_thread();
+
+    //! Release active thread
+    void release_active_thread();
+
+    //! Wait for thread to stop
+    bool wait_stop_thread();
+
+    //! Add thread to stop list
+    void add_stop_thread();
+
+    void remove_server_ref() {
+        if( --my_ref_count==0 ) {
+            my_client.acknowledge_close_connection();
+            this->~ipc_server();
+            tbb::cache_aligned_allocator<ipc_server>().deallocate( this, 1 );
+        }
+    }
+
+    friend class ipc_worker;
+    friend class ipc_waker;
+    friend class ipc_stopper;
+public:
+    ipc_server(tbb_client& client);
+    virtual ~ipc_server();
+
+    version_type version() const __TBB_override {
+        return 0;
+    }
+
+    void request_close_connection(bool /*exiting*/) __TBB_override {
+        my_waker->start_shutdown(false);
+        my_stopper->start_shutdown(false);
+        for( size_t i=0; i<my_n_thread; ++i )
+            my_thread_array[i].start_shutdown( my_join_workers );
+        remove_server_ref();
+    }
+
+    void yield() __TBB_override {__TBB_Yield();}
+
+    void independent_thread_number_changed(int) __TBB_override { __TBB_ASSERT( false, NULL ); }
+
+    unsigned default_concurrency() const __TBB_override { return my_n_thread - 1; }
+
+    void adjust_job_count_estimate(int delta) __TBB_override;
+
+#if _WIN32||_WIN64
+    void register_master(::rml::server::execution_resource_t&) __TBB_override {}
+    void unregister_master(::rml::server::execution_resource_t) __TBB_override {}
+#endif /* _WIN32||_WIN64 */
+};
+
+//------------------------------------------------------------------------
+// Methods of ipc_worker
+//------------------------------------------------------------------------
+#if _MSC_VER && !defined(__INTEL_COMPILER)
+    // Suppress overzealous compiler warnings about an initialized variable 'sink_for_alloca' not referenced
+    #pragma warning(push)
+    #pragma warning(disable:4189)
+#endif
+#if __MINGW32__ && __GNUC__==4 &&__GNUC_MINOR__>=2 && !__MINGW64__
+// ensure that stack is properly aligned
+__attribute__((force_align_arg_pointer))
+#endif
+__RML_DECL_THREAD_ROUTINE ipc_worker::thread_routine(void* arg) {
+    ipc_worker* self = static_cast<ipc_worker*>(arg);
+    AVOID_64K_ALIASING( self->my_index );
+    self->run();
+    return 0;
+}
+#if _MSC_VER && !defined(__INTEL_COMPILER)
+    #pragma warning(pop)
+#endif
+
+void ipc_worker::release_handle(thread_handle handle, bool join) {
+    if( join )
+        ipc_thread_monitor::join( handle );
+    else
+        ipc_thread_monitor::detach_thread( handle );
+}
+
+void ipc_worker::start_shutdown(bool join) {
+    state_t s;
+
+    do {
+        s = my_state;
+        __TBB_ASSERT( s!=st_quit, NULL );
+    } while( my_state.compare_and_swap( st_quit, s )!=s );
+    if( s==st_normal || s==st_starting ) {
+        // May have invalidated invariant for sleeping, so wake up the thread.
+        // Note that the notify() here occurs without maintaining invariants for my_slack.
+        // It does not matter, because my_state==st_quit overrides checking of my_slack.
+        my_thread_monitor.notify();
+        // Do not need release handle in st_init state,
+        // because in this case the thread wasn't started yet.
+        // For st_starting release is done at launch site.
+        if( s==st_normal )
+            release_handle( my_handle, join );
+    }
+}
+
+void ipc_worker::start_stopping(bool join) {
+    state_t s;
+
+    do {
+        s = my_state;
+    } while( my_state.compare_and_swap( st_stop, s )!=s );
+    if( s==st_normal || s==st_starting ) {
+        // May have invalidated invariant for sleeping, so wake up the thread.
+        // Note that the notify() here occurs without maintaining invariants for my_slack.
+        // It does not matter, because my_state==st_quit overrides checking of my_slack.
+        my_thread_monitor.notify();
+        // Do not need release handle in st_init state,
+        // because in this case the thread wasn't started yet.
+        // For st_starting release is done at launch site.
+        if( s==st_normal )
+            release_handle( my_handle, join );
+    }
+}
+
+void ipc_worker::run() {
+    my_server.propagate_chain_reaction();
+
+    // Transiting to st_normal here would require setting my_handle,
+    // which would create race with the launching thread and
+    // complications in handle management on Windows.
+
+    ::rml::job& j = *my_client.create_one_job();
+    state_t state = my_state;
+    while( state!=st_quit && state!=st_stop ) {
+        if( my_server.my_slack>=0 ) {
+            my_client.process(j);
+        } else {
+            ipc_thread_monitor::cookie c;
+            // Prepare to wait
+            my_thread_monitor.prepare_wait(c);
+            // Check/set the invariant for sleeping
+            state = my_state;
+            if( state!=st_quit && state!=st_stop && my_server.try_insert_in_asleep_list(*this) ) {
+                if( my_server.my_n_thread > 1 ) my_server.release_active_thread();
+                my_thread_monitor.commit_wait(c);
+                my_server.propagate_chain_reaction();
+            } else {
+                // Invariant broken
+                my_thread_monitor.cancel_wait();
+            }
+        }
+        state = my_state;
+    }
+    my_client.cleanup(j);
+
+    my_server.remove_server_ref();
+}
+
+inline bool ipc_worker::wake_or_launch() {
+    if( ( my_state==st_init && my_state.compare_and_swap( st_starting, st_init )==st_init ) ||
+        ( my_state==st_stop && my_state.compare_and_swap( st_starting, st_stop )==st_stop ) ) {
+        // after this point, remove_server_ref() must be done by created thread
+#if USE_WINTHREAD
+        my_handle = ipc_thread_monitor::launch( thread_routine, this, my_server.my_stack_size, &this->my_index );
+#elif USE_PTHREAD
+        {
+        affinity_helper fpa;
+        fpa.protect_affinity_mask( /*restore_process_mask=*/true );
+        my_handle = ipc_thread_monitor::launch( thread_routine, this, my_server.my_stack_size );
+        if( my_handle == 0 ) {
+            // Unable to create new thread for process
+            // However, this is expected situation for the use cases of this coordination server
+            state_t s = my_state.compare_and_swap( st_init, st_starting );
+            if (st_starting != s) {
+                // Do shutdown during startup. my_handle can't be released
+                // by start_shutdown, because my_handle value might be not set yet
+                // at time of transition from st_starting to st_quit.
+                __TBB_ASSERT( s==st_quit, NULL );
+                release_handle( my_handle, my_server.my_join_workers );
+            }
+            return false;
+        } else {
+            my_server.my_ref_count++;
+        }
+        // Implicit destruction of fpa resets original affinity mask.
+        }
+#endif /* USE_PTHREAD */
+        state_t s = my_state.compare_and_swap( st_normal, st_starting );
+        if( st_starting!=s ) {
+            // Do shutdown during startup. my_handle can't be released
+            // by start_shutdown, because my_handle value might be not set yet
+            // at time of transition from st_starting to st_quit.
+            __TBB_ASSERT( s==st_quit, NULL );
+            release_handle( my_handle, my_server.my_join_workers );
+        }
+    }
+    else {
+        my_thread_monitor.notify();
+    }
+
+    return true;
+}
+
+//------------------------------------------------------------------------
+// Methods of ipc_waker
+//------------------------------------------------------------------------
+#if _MSC_VER && !defined(__INTEL_COMPILER)
+    // Suppress overzealous compiler warnings about an initialized variable 'sink_for_alloca' not referenced
+    #pragma warning(push)
+    #pragma warning(disable:4189)
+#endif
+#if __MINGW32__ && __GNUC__==4 &&__GNUC_MINOR__>=2 && !__MINGW64__
+// ensure that stack is properly aligned
+__attribute__((force_align_arg_pointer))
+#endif
+__RML_DECL_THREAD_ROUTINE ipc_waker::thread_routine(void* arg) {
+    ipc_waker* self = static_cast<ipc_waker*>(arg);
+    AVOID_64K_ALIASING( self->my_index );
+    self->run();
+    return 0;
+}
+#if _MSC_VER && !defined(__INTEL_COMPILER)
+    #pragma warning(pop)
+#endif
+
+void ipc_waker::run() {
+    // Transiting to st_normal here would require setting my_handle,
+    // which would create race with the launching thread and
+    // complications in handle management on Windows.
+
+    while( my_state!=st_quit ) {
+        bool have_to_sleep = false;
+        if( my_server.my_slack>0 ) {
+            if( my_server.wait_active_thread() ) {
+                if( my_server.my_slack>0 ) {
+                    my_server.wake_some( 0, 1 );
+                } else {
+                    my_server.release_active_thread();
+                    have_to_sleep = true;
+                }
+            }
+        } else {
+            have_to_sleep = true;
+        }
+        if( have_to_sleep ) {
+            ipc_thread_monitor::cookie c;
+            // Prepare to wait
+            my_thread_monitor.prepare_wait(c);
+            // Check/set the invariant for sleeping
+            if( my_state!=st_quit && my_server.my_slack<0 ) {
+                my_thread_monitor.commit_wait(c);
+            } else {
+                // Invariant broken
+                my_thread_monitor.cancel_wait();
+            }
+        }
+    }
+
+    my_server.remove_server_ref();
+}
+
+inline bool ipc_waker::wake_or_launch() {
+    if( my_state==st_init && my_state.compare_and_swap( st_starting, st_init )==st_init ) {
+        // after this point, remove_server_ref() must be done by created thread
+#if USE_WINTHREAD
+        my_handle = ipc_thread_monitor::launch( thread_routine, this, my_server.my_stack_size, &this->my_index );
+#elif USE_PTHREAD
+        {
+        affinity_helper fpa;
+        fpa.protect_affinity_mask( /*restore_process_mask=*/true );
+        my_handle = ipc_thread_monitor::launch( thread_routine, this, my_server.my_stack_size );
+        if( my_handle == 0 ) {
+            runtime_warning( "Unable to create new thread for process %d", getpid() );
+            state_t s = my_state.compare_and_swap( st_init, st_starting );
+            if (st_starting != s) {
+                // Do shutdown during startup. my_handle can't be released
+                // by start_shutdown, because my_handle value might be not set yet
+                // at time of transition from st_starting to st_quit.
+                __TBB_ASSERT( s==st_quit, NULL );
+                release_handle( my_handle, my_server.my_join_workers );
+            }
+            return false;
+        } else {
+            my_server.my_ref_count++;
+        }
+        // Implicit destruction of fpa resets original affinity mask.
+        }
+#endif /* USE_PTHREAD */
+        state_t s = my_state.compare_and_swap( st_normal, st_starting );
+        if( st_starting!=s ) {
+            // Do shutdown during startup. my_handle can't be released
+            // by start_shutdown, because my_handle value might be not set yet
+            // at time of transition from st_starting to st_quit.
+            __TBB_ASSERT( s==st_quit, NULL );
+            release_handle( my_handle, my_server.my_join_workers );
+        }
+    }
+    else {
+        my_thread_monitor.notify();
+    }
+
+    return true;
+}
+
+//------------------------------------------------------------------------
+// Methods of ipc_stopper
+//------------------------------------------------------------------------
+#if _MSC_VER && !defined(__INTEL_COMPILER)
+    // Suppress overzealous compiler warnings about an initialized variable 'sink_for_alloca' not referenced
+    #pragma warning(push)
+    #pragma warning(disable:4189)
+#endif
+#if __MINGW32__ && __GNUC__==4 &&__GNUC_MINOR__>=2 && !__MINGW64__
+// ensure that stack is properly aligned
+__attribute__((force_align_arg_pointer))
+#endif
+__RML_DECL_THREAD_ROUTINE ipc_stopper::thread_routine(void* arg) {
+    ipc_stopper* self = static_cast<ipc_stopper*>(arg);
+    AVOID_64K_ALIASING( self->my_index );
+    self->run();
+    return 0;
+}
+#if _MSC_VER && !defined(__INTEL_COMPILER)
+    #pragma warning(pop)
+#endif
+
+void ipc_stopper::run() {
+    // Transiting to st_normal here would require setting my_handle,
+    // which would create race with the launching thread and
+    // complications in handle management on Windows.
+
+    while( my_state!=st_quit ) {
+        if( my_server.wait_stop_thread() ) {
+            if( my_state!=st_quit ) {
+                if( !my_server.stop_one() ) {
+                    my_server.add_stop_thread();
+                    prolonged_pause();
+                }
+            }
+        }
+    }
+
+    my_server.remove_server_ref();
+}
+
+inline bool ipc_stopper::wake_or_launch() {
+    if( my_state==st_init && my_state.compare_and_swap( st_starting, st_init )==st_init ) {
+        // after this point, remove_server_ref() must be done by created thread
+#if USE_WINTHREAD
+        my_handle = ipc_thread_monitor::launch( thread_routine, this, my_server.my_stack_size, &this->my_index );
+#elif USE_PTHREAD
+        {
+        affinity_helper fpa;
+        fpa.protect_affinity_mask( /*restore_process_mask=*/true );
+        my_handle = ipc_thread_monitor::launch( thread_routine, this, my_server.my_stack_size );
+        if( my_handle == 0 ) {
+            runtime_warning( "Unable to create new thread for process %d", getpid() );
+            state_t s = my_state.compare_and_swap( st_init, st_starting );
+            if (st_starting != s) {
+                // Do shutdown during startup. my_handle can't be released
+                // by start_shutdown, because my_handle value might be not set yet
+                // at time of transition from st_starting to st_quit.
+                __TBB_ASSERT( s==st_quit, NULL );
+                release_handle( my_handle, my_server.my_join_workers );
+            }
+            return false;
+        } else {
+            my_server.my_ref_count++;
+        }
+        // Implicit destruction of fpa resets original affinity mask.
+        }
+#endif /* USE_PTHREAD */
+        state_t s = my_state.compare_and_swap( st_normal, st_starting );
+        if( st_starting!=s ) {
+            // Do shutdown during startup. my_handle can't be released
+            // by start_shutdown, because my_handle value might be not set yet
+            // at time of transition from st_starting to st_quit.
+            __TBB_ASSERT( s==st_quit, NULL );
+            release_handle( my_handle, my_server.my_join_workers );
+        }
+    }
+    else {
+        my_thread_monitor.notify();
+    }
+
+    return true;
+}
+
+//------------------------------------------------------------------------
+// Methods of ipc_server
+//------------------------------------------------------------------------
+ipc_server::ipc_server(tbb_client& client) :
+    my_client( client ),
+    my_stack_size( client.min_stack_size() ),
+    my_thread_array(NULL),
+    my_waker(NULL),
+    my_stopper(NULL),
+    my_join_workers(false)
+{
+    my_ref_count = 1;
+    my_slack = 0;
+#if TBB_USE_ASSERT
+    my_net_slack_requests = 0;
+#endif /* TBB_USE_ASSERT */
+    my_n_thread = get_num_threads(IPC_MAX_THREADS_VAR_NAME);
+    if( my_n_thread==0 ) {
+        my_n_thread = AvailableHwConcurrency();
+        __TBB_ASSERT( my_n_thread>0, NULL );
+    }
+
+    my_asleep_list_root = NULL;
+    my_thread_array = tbb::cache_aligned_allocator<padded_ipc_worker>().allocate( my_n_thread );
+    memset( my_thread_array, 0, sizeof(padded_ipc_worker)*my_n_thread );
+    for( size_t i=0; i<my_n_thread; ++i ) {
+        ipc_worker* t = new( &my_thread_array[i] ) padded_ipc_worker( *this, client, i );
+        t->my_next = my_asleep_list_root;
+        my_asleep_list_root = t;
+    }
+
+    my_waker = tbb::cache_aligned_allocator<ipc_waker>().allocate(1);
+    memset( my_waker, 0, sizeof(ipc_waker) );
+    new( my_waker ) ipc_waker( *this, client, my_n_thread );
+
+    my_stopper = tbb::cache_aligned_allocator<ipc_stopper>().allocate(1);
+    memset( my_stopper, 0, sizeof(ipc_stopper) );
+    new( my_stopper ) ipc_stopper( *this, client, my_n_thread + 1 );
+
+    char* active_sem_name = get_active_sem_name();
+    my_active_sem = sem_open( active_sem_name, O_CREAT, IPC_SEM_MODE, my_n_thread - 1 );
+    __TBB_ASSERT( my_active_sem, "Unable to open active threads semaphore" );
+    delete[] active_sem_name;
+
+    char* stop_sem_name = get_stop_sem_name();
+    my_stop_sem = sem_open( stop_sem_name, O_CREAT, IPC_SEM_MODE, 0 );
+    __TBB_ASSERT( my_stop_sem, "Unable to open stop threads semaphore" );
+    delete[] stop_sem_name;
+}
+
+ipc_server::~ipc_server() {
+    __TBB_ASSERT( my_net_slack_requests==0, NULL );
+
+    for( size_t i=my_n_thread; i--; )
+        my_thread_array[i].~padded_ipc_worker();
+    tbb::cache_aligned_allocator<padded_ipc_worker>().deallocate( my_thread_array, my_n_thread );
+    tbb::internal::poison_pointer( my_thread_array );
+
+    my_waker->~ipc_waker();
+    tbb::cache_aligned_allocator<ipc_waker>().deallocate( my_waker, 1 );
+    tbb::internal::poison_pointer( my_waker );
+
+    my_stopper->~ipc_stopper();
+    tbb::cache_aligned_allocator<ipc_stopper>().deallocate( my_stopper, 1 );
+    tbb::internal::poison_pointer( my_stopper );
+
+    sem_close( my_active_sem );
+    sem_close( my_stop_sem );
+}
+
+inline bool ipc_server::try_insert_in_asleep_list(ipc_worker& t) {
+    asleep_list_mutex_type::scoped_lock lock;
+    if( !lock.try_acquire( my_asleep_list_mutex ) )
+        return false;
+    // Contribute to slack under lock so that if another takes that unit of slack,
+    // it sees us sleeping on the list and wakes us up.
+    int k = ++my_slack;
+    if( k<=0 ) {
+        t.my_next = my_asleep_list_root;
+        my_asleep_list_root = &t;
+        return true;
+    } else {
+        --my_slack;
+        return false;
+    }
+}
+
+inline bool ipc_server::try_insert_in_asleep_list_forced(ipc_worker& t) {
+    asleep_list_mutex_type::scoped_lock lock;
+    if( !lock.try_acquire( my_asleep_list_mutex ) )
+        return false;
+    // Contribute to slack under lock so that if another takes that unit of slack,
+    // it sees us sleeping on the list and wakes us up.
+    ++my_slack;
+    t.my_next = my_asleep_list_root;
+    my_asleep_list_root = &t;
+    return true;
+}
+
+inline bool ipc_server::wait_active_thread() {
+    if( sem_wait( my_active_sem ) == 0 ) {
+        ++my_global_thread_count;
+        return true;
+    }
+    return false;
+}
+
+inline bool ipc_server::try_get_active_thread() {
+    if( sem_trywait( my_active_sem ) == 0 ) {
+        ++my_global_thread_count;
+        return true;
+    }
+    return false;
+}
+
+inline void ipc_server::release_active_thread() {
+    release_thread_sem( my_active_sem );
+}
+
+inline bool ipc_server::wait_stop_thread() {
+    struct timespec ts;
+    if( clock_gettime( CLOCK_REALTIME, &ts )==0 ) {
+        ts.tv_sec++;
+        if( sem_timedwait( my_stop_sem, &ts )==0 ) {
+            return true;
+        }
+    }
+    return false;
+}
+
+inline void ipc_server::add_stop_thread() {
+    sem_post( my_stop_sem );
+}
+
+void ipc_server::wake_some( int additional_slack, int active_threads ) {
+    __TBB_ASSERT( additional_slack>=0, NULL );
+    ipc_worker* wakee[2];
+    ipc_worker **w = wakee;
+    {
+        asleep_list_mutex_type::scoped_lock lock(my_asleep_list_mutex);
+        while( active_threads>0 && my_asleep_list_root && w<wakee+2 ) {
+            if( additional_slack>0 ) {
+                if( additional_slack+my_slack<=0 ) // additional demand does not exceed surplus supply
+                    break;
+                --additional_slack;
+            } else {
+                // Chain reaction; Try to claim unit of slack
+                int old;
+                do {
+                    old = my_slack;
+                    if( old<=0 ) goto done;
+                } while( my_slack.compare_and_swap( old-1, old )!=old );
+            }
+            // Pop sleeping worker to combine with claimed unit of slack
+            my_asleep_list_root = (*w++ = my_asleep_list_root)->my_next;
+            --active_threads;
+        }
+        if( additional_slack ) {
+            // Contribute our unused slack to my_slack.
+            my_slack += additional_slack;
+        }
+    }
+done:
+    while( w>wakee ) {
+        if( !(*--w)->wake_or_launch() ) {
+            add_stop_thread();
+            do {
+            } while( !try_insert_in_asleep_list_forced(**w) );
+            release_active_thread();
+        }
+    }
+    while( active_threads ) {
+        release_active_thread();
+        --active_threads;
+    }
+}
+
+void ipc_server::wake_one_forced( int additional_slack ) {
+    __TBB_ASSERT( additional_slack>=0, NULL );
+    ipc_worker* wakee[1];
+    ipc_worker **w = wakee;
+    {
+        asleep_list_mutex_type::scoped_lock lock(my_asleep_list_mutex);
+        while( my_asleep_list_root && w<wakee+1 ) {
+            if( additional_slack>0 ) {
+                if( additional_slack+my_slack<=0 ) // additional demand does not exceed surplus supply
+                    break;
+                --additional_slack;
+            } else {
+                // Chain reaction; Try to claim unit of slack
+                int old;
+                do {
+                    old = my_slack;
+                    if( old<=0 ) goto done;
+                } while( my_slack.compare_and_swap( old-1, old )!=old );
+            }
+            // Pop sleeping worker to combine with claimed unit of slack
+            my_asleep_list_root = (*w++ = my_asleep_list_root)->my_next;
+        }
+        if( additional_slack ) {
+            // Contribute our unused slack to my_slack.
+            my_slack += additional_slack;
+        }
+    }
+done:
+    while( w>wakee ) {
+        if( !(*--w)->wake_or_launch() ) {
+            add_stop_thread();
+            do {
+            } while( !try_insert_in_asleep_list_forced(**w) );
+        }
+    }
+}
+
+bool ipc_server::stop_one() {
+    ipc_worker* current = NULL;
+    ipc_worker* next = NULL;
+    {
+        asleep_list_mutex_type::scoped_lock lock(my_asleep_list_mutex);
+        if( my_asleep_list_root ) {
+            current = my_asleep_list_root;
+            if( current->my_state==ipc_worker::st_normal ) {
+                next = current->my_next;
+                while( next!= NULL && next->my_state==ipc_worker::st_normal ) {
+                    current = next;
+                    next = current->my_next;
+                }
+                current->start_stopping( my_join_workers );
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
+void ipc_server::adjust_job_count_estimate( int delta ) {
+#if TBB_USE_ASSERT
+    my_net_slack_requests+=delta;
+#endif /* TBB_USE_ASSERT */
+    if( my_n_thread > 1 ) {
+        if( delta<0 ) {
+            my_slack+=delta;
+        } else if( delta>0 ) {
+            int active_threads = 0;
+            if( try_get_active_thread() ) {
+                ++active_threads;
+                if( try_get_active_thread() ) {
+                    ++active_threads;
+                }
+            }
+            wake_some( delta, active_threads );
+
+            if( !my_waker->wake_or_launch() ) {
+                add_stop_thread();
+            }
+            if( !my_stopper->wake_or_launch() ) {
+                add_stop_thread();
+            }
+        }
+    } else { // Corner case when RML shouldn't provide any worker thread but client has to have at least one
+        if( delta<0 ) {
+            my_slack += delta;
+        } else {
+            wake_one_forced( delta );
+        }
+    }
+}
+
+//------------------------------------------------------------------------
+// RML factory methods
+//------------------------------------------------------------------------
+
+#if USE_PTHREAD
+
+static tbb_client* my_global_client = NULL;
+static tbb_server* my_global_server = NULL;
+
+void rml_atexit() {
+    release_resources();
+}
+
+void rml_atfork_child() {
+    if( my_global_server!=NULL && my_global_client!=NULL ) {
+        ipc_server* server = static_cast<ipc_server*>( my_global_server );
+        server->~ipc_server();
+        memset( server, 0, sizeof(ipc_server) );
+        new( server ) ipc_server( *my_global_client );
+        pthread_atfork( NULL, NULL, rml_atfork_child );
+        atexit( rml_atexit );
+    }
+}
+
+#endif /* USE_PTHREAD */
+
+extern "C" tbb_factory::status_type __TBB_make_rml_server(tbb_factory& f, tbb_server*& server, tbb_client& client) {
+    server = new( tbb::cache_aligned_allocator<ipc_server>().allocate(1) ) ipc_server(client);
+#if USE_PTHREAD
+    my_global_client = &client;
+    my_global_server = server;
+    pthread_atfork( NULL, NULL, rml_atfork_child );
+    atexit( rml_atexit );
+#endif /* USE_PTHREAD */
+    if( getenv( "RML_DEBUG" ) ) {
+        runtime_warning("IPC server is started");
+    }
+    return tbb_factory::st_success;
+}
+
+extern "C" void __TBB_call_with_my_server_info(::rml::server_info_callback_t cb, void* arg) {
+}
+
+} // namespace rml
+} // namespace internal
+
+} // namespace tbb
diff --git a/python/rml/ipc_utils.cpp b/python/rml/ipc_utils.cpp
new file mode 100644
index 0000000000..a7a62c6332
--- /dev/null
+++ b/python/rml/ipc_utils.cpp
@@ -0,0 +1,144 @@
+/*
+    Copyright (c) 2017 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+
+
+
+*/
+
+#include "ipc_utils.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <limits.h>
+#include <string.h>
+#include <unistd.h>
+
+namespace tbb {
+namespace internal {
+namespace rml {
+
+#define MAX_STR_LEN 255
+#define STARTTIME_ITEM_ID 21
+
+static char* get_stat_item(char* line, int item_id) {
+    int id = 0, i = 0;
+
+    while( id!=item_id ) {
+        while( line[i]!='(' && line[i]!=' ' && line[i]!='\0' ) {
+            ++i;
+        }
+        if( line[i]==' ' ) {
+            ++id;
+            ++i;
+        } else if( line[i]=='(' ) {
+            while( line[i]!=')' && line[i]!='\0' ) {
+               ++i;
+            }
+            if( line[i]==')' ) {
+                ++i;
+            } else {
+                return NULL;
+            }
+        } else {
+            return NULL;
+        }
+    }
+
+    return line + i;
+}
+
+unsigned long long get_start_time(int pid) {
+    const char* stat_file_path_template = "/proc/%d/stat";
+    char stat_file_path[MAX_STR_LEN + 1];
+    sprintf( stat_file_path, stat_file_path_template, pid );
+
+    FILE* stat_file = fopen( stat_file_path, "rt" );
+    if( stat_file==NULL ) {
+        return 0;
+    }
+
+    char stat_line[MAX_STR_LEN + 1];
+    char* line = fgets( stat_line, MAX_STR_LEN, stat_file );
+    if( line==NULL ) {
+        return 0;
+    }
+
+    char* starttime_str = get_stat_item( stat_line, STARTTIME_ITEM_ID );
+    if( starttime_str==NULL ) {
+        return 0;
+    }
+
+    unsigned long long starttime = strtoull( starttime_str, NULL, 10 );
+    if( starttime==ULLONG_MAX ) {
+        return 0;
+    }
+
+    return starttime;
+}
+
+char* get_shared_name(const char* prefix, int pid, unsigned long long time) {
+    const char* name_template = "%s_%d_%llu";
+    const int digits_in_int = 10;
+    const int digits_in_long = 20;
+
+    int len = strlen( name_template ) + strlen( prefix ) + digits_in_int + digits_in_long + 1;
+    char* name = new char[len];
+    sprintf( name, name_template, prefix, pid, time );
+
+    return name;
+}
+
+char* get_shared_name(const char* prefix) {
+    int pid = getpgrp();
+    unsigned long long time = get_start_time( pid );
+    return get_shared_name( prefix, pid, time );
+}
+
+int get_num_threads(const char* env_var) {
+    if( env_var==NULL ) {
+        return 0;
+    }
+
+    char* value = getenv( env_var );
+    if( value==NULL ) {
+        return 0;
+    }
+
+    int num_threads = (int)strtol( value, NULL, 10 );
+    return num_threads;
+}
+
+bool get_enable_flag(const char* env_var) {
+    if( env_var==NULL ) {
+        return false;
+    }
+
+    char* value = getenv( env_var );
+    if( value==NULL ) {
+        return false;
+    }
+
+    if( strcmp( value, "0" ) == 0 ||
+        strcmp( value, "false" ) == 0 ||
+        strcmp( value, "False" ) == 0 ||
+        strcmp( value, "FALSE" ) == 0 ) {
+        return false;
+    }
+
+    return true;
+}
+
+}}} //tbb::internal::rml
diff --git a/python/rml/ipc_utils.h b/python/rml/ipc_utils.h
new file mode 100644
index 0000000000..2ca4626bc0
--- /dev/null
+++ b/python/rml/ipc_utils.h
@@ -0,0 +1,34 @@
+/*
+    Copyright (c) 2017 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+
+
+
+*/
+
+#ifndef __IPC_UTILS_H
+#define __IPC_UTILS_H
+
+namespace tbb {
+namespace internal {
+namespace rml {
+
+char* get_shared_name(const char* prefix);
+int get_num_threads(const char* env_var);
+bool get_enable_flag(const char* env_var);
+
+}}} //tbb::internal::rml
+
+#endif
diff --git a/python/setup.py b/python/setup.py
index 89e74a694a..62e8e08ce7 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -18,14 +18,19 @@
 #
 #
 
+
 # System imports
 from __future__ import print_function
+from glob import glob
 import platform
 import os
 
 from distutils.core import *
 from distutils.command.build import build
 
+rundir = os.getcwd()
+os.chdir(os.path.abspath(os.path.dirname(__file__)))
+
 if any(i in os.environ for i in ["CC", "CXX"]):
     if "CC" not in os.environ:
         os.environ['CC'] = os.environ['CXX']
@@ -33,17 +38,18 @@
         os.environ['CXX'] = os.environ['CC']
     if platform.system() == 'Linux':
         os.environ['LDSHARED'] = os.environ['CXX'] + " -shared"
+    print("Environment specifies CC=%s CXX=%s"%(os.environ['CC'], os.environ['CXX']))
 
 intel_compiler = os.getenv('CC', '') in ['icl', 'icpc', 'icc']
 try:
     tbb_root = os.environ['TBBROOT']
     print("Using TBBROOT=", tbb_root)
 except:
-    tbb_root = '.'
+    tbb_root = '..'
     if not intel_compiler:
         print("Warning: TBBROOT env var is not set and Intel's compiler is not used. It might lead\n"
               "    !!!: to compile/link problems. Source tbbvars.sh/.csh file to set environment")
-use_compiler_tbb = intel_compiler and tbb_root == '.'
+use_compiler_tbb = intel_compiler and tbb_root == '..'
 if use_compiler_tbb:
     print("Using Intel TBB from Intel's compiler")
 if platform.system() == 'Windows':
@@ -52,21 +58,24 @@
         os.environ['MSSdk'] = '1'
         print("Using compiler settings from environment")
     tbb_flag = ['/Qtbb'] if use_compiler_tbb else []
+    tbb_flag += ['/EHsc'] # for Python 2
     compile_flags = ['/Qstd=c++11'] if intel_compiler else []
 else:
     tbb_flag = ['-tbb'] if use_compiler_tbb else []
     compile_flags = ['-std=c++11', '-Wno-unused-variable']
 
-_tbb = Extension("_TBB", ["tbb.i"],
+_tbb = Extension("tbb._api", ["tbb/api.i"],
         include_dirs=[os.path.join(tbb_root, 'include')] if not use_compiler_tbb else [],
         swig_opts   =['-c++', '-O', '-threads'] + (  # add '-builtin' later
               ['-I' + os.path.join(tbb_root, 'include')] if not use_compiler_tbb else []),
         extra_compile_args=compile_flags + tbb_flag,
         extra_link_args=tbb_flag,
-        libraries   =['tbb'] if not use_compiler_tbb else [],
-        library_dirs=[os.path.join(tbb_root, 'lib', 'intel64', 'gcc4.4'),  # for Linux
-                      os.path.join(tbb_root, 'lib'),                       # for MacOS
-                      os.path.join(tbb_root, 'lib', 'intel64', 'vc_mt'),   # for Windows
+        libraries   =(['tbb'] if not use_compiler_tbb else []) +
+                     (['irml'] if platform.system() == "Linux" else []),   # TODO: why do we need this?
+        library_dirs=[ rundir,                                              # for custom-builds
+                       os.path.join(tbb_root, 'lib', 'intel64', 'gcc4.4'),  # for Linux
+                       os.path.join(tbb_root, 'lib'),                       # for MacOS
+                       os.path.join(tbb_root, 'lib', 'intel64', 'vc_mt'),   # for Windows
                      ] if not use_compiler_tbb else [],
         language    ='c++',
         )
@@ -86,7 +95,7 @@ class TBBBuild(build):
         url         ="https://software.intel.com/en-us/intel-tbb",
         author      ="Intel Corporation",
         author_email="inteltbbdevelopers@intel.com",
-        license     ="Dual license: Apache or Intel Simplified Software License",
+        license     ="Dual license: Apache or Proprietary",
         version     ="0.1",
         classifiers =[
             'Development Status :: 4 - Beta',
@@ -97,11 +106,9 @@ class TBBBuild(build):
             'Intended Audience :: Other Audience',
             'Intended Audience :: Science/Research',
             'License :: OSI Approved :: Apache Software License',
-            'License :: Other/Intel Simplified Software License',
             'Operating System :: MacOS :: MacOS X',
             'Operating System :: Microsoft :: Windows',
-            'Operating System :: POSIX',
-            'Operating System :: Unix',
+            'Operating System :: POSIX :: Linux',
             'Programming Language :: Python',
             'Programming Language :: Python :: 2',
             'Programming Language :: Python :: 3',
@@ -109,8 +116,9 @@ class TBBBuild(build):
             'Topic :: System :: Hardware :: Symmetric Multi-processing',
             'Topic :: Software Development :: Libraries',
           ],
-        keywords='tbb multiprocessing multithreading composable parallelism',
+        keywords='TBB multiprocessing multithreading composable parallelism',
         ext_modules=[_tbb],
+        packages=['tbb'],
         py_modules=['TBB'],
         cmdclass={'build': TBBBuild}
 )
diff --git a/python/tbb/__init__.py b/python/tbb/__init__.py
new file mode 100644
index 0000000000..20d9a78450
--- /dev/null
+++ b/python/tbb/__init__.py
@@ -0,0 +1,322 @@
+#!/usr/bin/env python
+#
+# Copyright (c) 2016-2017 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+#
+#
+
+
+from __future__ import print_function
+
+import multiprocessing.pool
+import ctypes
+import atexit
+import sys
+import os
+ 
+from .api import  *
+from .api import __all__ as api__all
+from .pool import *
+from .pool import __all__ as pool__all
+
+__all__ = ["Monkey", "is_active"] + api__all + pool__all
+
+__doc__ = """
+Python API for Intel(R) Threading Building Blocks library (Intel(R) TBB)
+extended with standard Python's pools implementation and monkey-patching.
+
+Command-line interface example:
+$  python -m tbb $your_script.py
+Runs your_script.py in context of tbb.Monkey
+"""
+
+is_active = False
+""" Indicates whether TBB context is activated """
+
+ipc_enabled = False
+""" Indicates whether IPC mode is enabled """
+
+libirml = "libirml.so.1"
+
+
+def _test(arg=None):
+    """Some tests"""
+    import platform
+    if platform.system() == "Linux":
+        ctypes.CDLL(libirml)
+    from .test import test
+    test(arg)
+    print("done")
+
+
+def tbb_process_pool_worker27(inqueue, outqueue, initializer=None, initargs=(),
+                            maxtasks=None):
+    from multiprocessing.pool import worker
+    worker(inqueue, outqueue, initializer, initargs, maxtasks)
+    if ipc_enabled:
+        try:
+            librml = ctypes.CDLL(libirml)
+            librml.release_resources()
+        except:
+            print("Warning: Can not load ", libirml, file=sys.stderr)
+
+
+class TBBProcessPool27(multiprocessing.pool.Pool):
+    def _repopulate_pool(self):
+        """Bring the number of pool processes up to the specified number,
+        for use after reaping workers which have exited.
+        """
+        from multiprocessing.util import debug
+
+        for i in range(self._processes - len(self._pool)):
+            w = self.Process(target=tbb_process_pool_worker27,
+                             args=(self._inqueue, self._outqueue,
+                                   self._initializer,
+                                   self._initargs, self._maxtasksperchild)
+                            )
+            self._pool.append(w)
+            w.name = w.name.replace('Process', 'PoolWorker')
+            w.daemon = True
+            w.start()
+            debug('added worker')
+
+    def __del__(self):
+        self.close()
+        for p in self._pool:
+            p.join()
+
+    def __exit__(self, *args):
+        self.close()
+        for p in self._pool:
+            p.join()
+
+
+def tbb_process_pool_worker3(inqueue, outqueue, initializer=None, initargs=(),
+                            maxtasks=None, wrap_exception=False):
+    from multiprocessing.pool import worker
+    worker(inqueue, outqueue, initializer, initargs, maxtasks, wrap_exception)
+    if ipc_enabled:
+        try:
+            librml = ctypes.CDLL(libirml)
+            librml.release_resources()
+        except:
+            print("Warning: Can not load ", libirml, file=sys.stderr)
+
+
+class TBBProcessPool3(multiprocessing.pool.Pool):
+    def _repopulate_pool(self):
+        """Bring the number of pool processes up to the specified number,
+        for use after reaping workers which have exited.
+        """
+        from multiprocessing.util import debug
+
+        for i in range(self._processes - len(self._pool)):
+            w = self.Process(target=tbb_process_pool_worker3,
+                             args=(self._inqueue, self._outqueue,
+                                   self._initializer,
+                                   self._initargs, self._maxtasksperchild,
+                                   self._wrap_exception)
+                            )
+            self._pool.append(w)
+            w.name = w.name.replace('Process', 'PoolWorker')
+            w.daemon = True
+            w.start()
+            debug('added worker')
+
+    def __del__(self):
+        self.close()
+        for p in self._pool:
+            p.join()
+
+    def __exit__(self, *args):
+        self.close()
+        for p in self._pool:
+            p.join()
+
+
+class Monkey:
+    """
+    Context manager which replaces standard multiprocessing.pool
+    implementations with tbb.pool using monkey-patching. It also enables TBB
+    threading for Intel(R) Math Kernel Library (Intel(R) MKL). For example:
+
+        with tbb.Monkey():
+            run_my_numpy_code()
+
+    It allows multiple parallel tasks to be executed on the same thread pool
+    and coordinate number of threads across multiple processes thus avoiding
+    overheads from oversubscription.
+    """
+    _items   = {}
+    _modules = {}
+
+    def __init__(self, max_num_threads=None, benchmark=False):
+        """
+        Create context manager for running under TBB scheduler.
+        :param max_num_threads: if specified, limits maximal number of threads
+        :param benchmark: if specified, blocks in initialization until requested number of threads are ready
+        """
+        if max_num_threads:
+            self.ctl = global_control(global_control.max_allowed_parallelism, int(max_num_threads))
+        if benchmark:
+            if not max_num_threads:
+               max_num_threads = default_num_threads()
+            from .api import _concurrency_barrier
+            _concurrency_barrier(int(max_num_threads))
+
+    def _patch(self, class_name, module_name, obj):
+        m = self._modules[class_name] = __import__(module_name, globals(),
+                                                   locals(), [class_name])
+        if m == None:
+            return
+        oldattr = getattr(m, class_name, None)
+        if oldattr == None:
+            self._modules[class_name] = None
+            return
+        self._items[class_name] = oldattr
+        setattr(m, class_name, obj)
+
+    def __enter__(self):
+        global is_active
+        assert is_active == False, "tbb.Monkey does not support nesting yet"
+        is_active = True
+        self.env = os.getenv('MKL_THREADING_LAYER')
+        os.environ['MKL_THREADING_LAYER'] = 'TBB'
+
+        if ipc_enabled:
+            if sys.version_info.major == 2 and sys.version_info.minor >= 7:
+                self._patch("Pool", "multiprocessing.pool", TBBProcessPool27)
+            elif sys.version_info.major == 3 and sys.version_info.minor >= 5:
+                self._patch("Pool", "multiprocessing.pool", TBBProcessPool3)
+        self._patch("ThreadPool", "multiprocessing.pool", Pool)
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        global is_active
+        assert is_active == True, "modified?"
+        is_active = False
+        if self.env is None:
+            del os.environ['MKL_THREADING_LAYER']
+        else:
+            os.environ['MKL_THREADING_LAYER'] = self.env
+        for name in self._items.keys():
+            setattr(self._modules[name], name, self._items[name])
+
+
+def init_sem_name():
+    try:
+        librml = ctypes.CDLL(libirml)
+        librml.set_active_sem_name()
+        librml.set_stop_sem_name()
+    except Exception as e:
+        print("Warning: Can not initialize name of shared semaphores:", e,
+              file=sys.stderr)
+
+
+def tbb_atexit():
+    if ipc_enabled:
+        try:
+            librml = ctypes.CDLL(libirml)
+            librml.release_semaphores()
+        except:
+            print("Warning: Can not release shared semaphores",
+                  file=sys.stderr)
+
+
+def _main():
+    # Run the module specified as the next command line argument
+    # python -m TBB user_app.py
+    global ipc_enabled
+
+    import platform
+    import argparse
+    parser = argparse.ArgumentParser(prog="python -m tbb", description="""
+                Run your Python script in context of tbb.Monkey, which
+                replaces standard Python pools and threading layer of
+                Intel(R) Math Kernel Library by implementation based on
+                Intel(R) Threading Building Blocks. It enables multiple parallel
+                tasks to be executed on the same thread pool and coordinate
+                number of threads across multiple processes thus avoiding
+                overheads from oversubscription.
+             """, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    if platform.system() == "Linux":
+        parser.add_argument('--ipc', action='store_true',
+                        help="Enable inter-process (IPC) coordination between Intel TBB schedulers")
+        parser.add_argument('-a', '--allocator', action='store_true',
+                        help="Enable Intel TBB scalable allocator as a replacement for standard memory allocator")
+        parser.add_argument('--allocator-huge-pages', action='store_true',
+                        help="Enable huge pages for Intel TBB allocator (implies: -a)")
+    parser.add_argument('-p', '--max-num-threads', default=default_num_threads(), type=int,
+                        help="Initialize Intel TBB with P max number of threads per process", metavar='P')
+    parser.add_argument('-b', '--benchmark', action='store_true',
+                        help="Block Intel TBB initialization until all the threads are created before continue the script. "
+                        "This is necessary for performance benchmarks that want to exclude lazy scheduler initialization effects from the measurements")
+    parser.add_argument('-v', '--verbose', action='store_true',
+                        help="Request verbose and version information")
+    parser.add_argument('-m', action='store_true', dest='module',
+                        help="Executes following as a module")
+    parser.add_argument('name', help="Script or module name")
+    parser.add_argument('args', nargs=argparse.REMAINDER,
+                        help="Command line arguments")
+    args = parser.parse_args()
+
+    if args.verbose:
+        os.environ["TBB_VERSION"] = "1"
+    if platform.system() == "Linux":
+        if args.allocator_huge_pages:
+            args.allocator = True
+        if args.allocator and not os.environ.get("_TBB_MALLOC_PRELOAD"):
+            libtbbmalloc_lib = 'libtbbmalloc_proxy.so.2'
+            ld_preload = 'LD_PRELOAD'
+            os.environ["_TBB_MALLOC_PRELOAD"] = "1"
+            preload_list = filter(None, os.environ.get(ld_preload, "").split(':'))
+            if libtbbmalloc_lib in preload_list:
+                print('Info:', ld_preload, "contains", libtbbmalloc_lib, "already\n")
+            else:
+                os.environ[ld_preload] = ':'.join([libtbbmalloc_lib] + list(preload_list))
+
+            if args.allocator_huge_pages:
+                assert platform.system() == "Linux"
+                try:
+                    with open('/proc/sys/vm/nr_hugepages', 'r') as f:
+                        pages = int(f.read())
+                    if pages == 0:
+                        print("TBB: Pre-allocated huge pages are not currently reserved in the system. To reserve, run e.g.:\n"
+                              "\tsudo sh -c 'echo 2000 > /proc/sys/vm/nr_hugepages'")
+                    os.environ["TBB_MALLOC_USE_HUGE_PAGES"] = "1"
+                except:
+                    print("TBB: Failed to read number of pages from /proc/sys/vm/nr_hugepages\n"
+                          "\tIs the Linux kernel configured with the huge pages feature?")
+                    sys.exit(1)
+
+            os.execl(sys.executable, sys.executable, '-m', 'tbb', *sys.argv[1:])
+            assert False, "Re-execution failed"
+
+    sys.argv = [args.name] + args.args
+    ipc_enabled = platform.system() == "Linux" and args.ipc
+    os.environ["IPC_ENABLE"] = "1" if ipc_enabled else "0"
+    if ipc_enabled:
+        atexit.register(tbb_atexit)
+        init_sem_name()
+    if not os.environ.get("KMP_BLOCKTIME"): # TODO move
+        os.environ["KMP_BLOCKTIME"] = "0"
+    if '_' + args.name in globals():
+        return globals()['_' + args.name](*args.args)
+    else:
+        import runpy
+        runf = runpy.run_module if args.module else runpy.run_path
+        with Monkey(max_num_threads=args.max_num_threads, benchmark=args.benchmark):
+            runf(args.name, run_name='__main__')
diff --git a/python/tbb/__main__.py b/python/tbb/__main__.py
new file mode 100644
index 0000000000..20fea9a88f
--- /dev/null
+++ b/python/tbb/__main__.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+#
+# Copyright (c) 2016-2017 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+#
+#
+
+
+from . import _main
+from sys import exit
+exit(_main())
diff --git a/python/tbb.i b/python/tbb/api.i
similarity index 58%
rename from python/tbb.i
rename to python/tbb/api.i
index 87cc95e9b0..bbc887a6d9 100644
--- a/python/tbb.i
+++ b/python/tbb/api.i
@@ -18,50 +18,31 @@
 #
 #
 
-# Based on the software developed by:
-# Copyright (c) 2008,2016 david decotigny (Pool of threads)
-# Copyright (c) 2006-2008, R Oudkerk (multiprocessing.Pool)
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-#
-# 1. Redistributions of source code must retain the above copyright
-#    notice, this list of conditions and the following disclaimer.
-# 2. Redistributions in binary form must reproduce the above copyright
-#    notice, this list of conditions and the following disclaimer in the
-#    documentation and/or other materials provided with the distribution.
-# 3. Neither the name of author nor the names of any contributors may be
-#    used to endorse or promote products derived from this software
-#    without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
-# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
-# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#
 
-from __future__ import print_function
+__all__ = ["task_arena", "task_group", "task_scheduler_init", "global_control", "default_num_threads"]
 %}
 %begin %{
 /* Defines Python wrappers for Intel(R) Threading Building Blocks (Intel TBB).*/
 %}
-%module TBB
+%module api
 
 #if SWIG_VERSION < 0x030001
 #error SWIG version 3.0.6 or newer is required for correct functioning
 #endif
 
 %{
+#define TBB_PREVIEW_GLOBAL_CONTROL 1
+#define TBB_PREVIEW_WAITING_FOR_WORKERS 1
 #include <tbb/tbb.h>
+#include <tbb/compat/condition_variable>
+#if TBB_IMPLEMENT_CPP0X
+namespace std { using tbb::mutex; }
+#define unique_ptr auto_ptr
+#else
+#include <condition_variable>
+#include <mutex>
+#include <memory>
+#endif
 using namespace tbb;
 
 class PyCaller : public swig::SwigPtr_PyObject {
@@ -91,8 +72,47 @@ struct ArenaPyCaller {
     }
 };
 
+struct barrier_data {
+    std::condition_variable event;
+    std::mutex m;
+    int worker_threads, full_threads;
+};
+
+class barrier_task : public tbb::task {
+    barrier_data &b;
+public:
+    barrier_task(barrier_data &d) : b(d) {}
+    /*override*/ tbb::task *execute() {
+        std::unique_lock<std::mutex> lock(b.m);
+        if(++b.worker_threads >= b.full_threads)
+            b.event.notify_all();
+        else while(b.worker_threads < b.full_threads)
+            b.event.wait(lock);
+        return NULL;
+    }
+};
+
+void _concurrency_barrier(int threads = tbb::task_scheduler_init::automatic) {
+    if(threads == task_scheduler_init::automatic)
+        threads = task_scheduler_init::default_num_threads();
+    if(threads < 2)
+        return;
+    std::unique_ptr<global_control> g(
+        (global_control::active_value(global_control::max_allowed_parallelism) < unsigned(threads))?
+            new global_control(global_control::max_allowed_parallelism, threads) : NULL);
+    barrier_data b;
+    b.worker_threads = 0;
+    b.full_threads = threads-1;
+    for(int i = 0; i < b.full_threads; i++)
+        tbb::task::enqueue( *new( tbb::task::allocate_root() ) barrier_task(b) );
+    std::unique_lock<std::mutex> lock(b.m);
+    b.event.wait(lock);
+};
+
 %}
 
+void _concurrency_barrier(int threads = tbb::task_scheduler_init::automatic);
+
 namespace tbb {
     class task_scheduler_init {
     public:
@@ -100,13 +120,14 @@ namespace tbb {
         static const int automatic = -1;
         //! Argument to initialize() or constructor that causes initialization to be deferred.
         static const int deferred = -2;
-        task_scheduler_init( int max_threads=automatic, 
+        task_scheduler_init( int max_threads=automatic,
                              size_t thread_stack_size=0 );
         ~task_scheduler_init();
         void initialize( int max_threads=automatic );
         void terminate();
         static int default_num_threads();
         bool is_active() const;
+        void blocking_terminate();
     };
 
     class task_arena {
@@ -139,7 +160,21 @@ namespace tbb {
         };
     };
 
-}
+    class global_control {
+    public:
+        enum parameter {
+            max_allowed_parallelism,
+            thread_stack_size,
+            parameter_max // insert new parameters above this point
+        };
+        global_control(parameter param, size_t value);
+        ~global_control();
+        static size_t active_value(parameter param);
+    };
 
-// Python part of the module
-%pythoncode "tbb.src.py"
+} // tbb
+
+// Additional definitions for Python part of the module
+%pythoncode %{
+default_num_threads = task_scheduler_init_default_num_threads
+%}
diff --git a/python/tbb.src.py b/python/tbb/pool.py
similarity index 79%
rename from python/tbb.src.py
rename to python/tbb/pool.py
index b2d3312d19..4747b22a5a 100644
--- a/python/tbb.src.py
+++ b/python/tbb/pool.py
@@ -1,4 +1,4 @@
-
+#!/usr/bin/env python
 #
 # Copyright (c) 2016-2017 Intel Corporation
 #
@@ -73,20 +73,14 @@
 import sys
 import threading
 import traceback
+from .api import *
 
-__all__ = ["Pool", "Monkey", "task_arena", "task_group", "task_scheduler_init"]
+__all__ = ["Pool", "TimeoutError"]
 __doc__ = """
-Python API to Intel(R) Threading Building Blocks library (Intel(R) TBB)
-extended with standard Pool implementation and monkey-patching.
-
-Command-line interface:
-$ python -m TBB $your_script.py
-
-Runs your_script.py in context of `with Monkey():`
+Standard Python Pool implementation based on Python API
+for Intel(R) Threading Building Blocks library (Intel(R) TBB)
 """
 
-default_num_threads = task_scheduler_init_default_num_threads
-
 
 class TimeoutError(Exception):
     """Raised when a result is not available within the given timeout"""
@@ -96,7 +90,7 @@ class TimeoutError(Exception):
 class Pool(object):
     """
     The Pool class provides standard multiprocessing.Pool interface
-    which is mapped onto Intel TBB tasks executing in its thread pool
+    which is mapped onto Intel(R) TBB tasks executing in its thread pool
     """
 
     def __init__(self, nworkers=0, name="Pool"):
@@ -639,202 +633,3 @@ def notify_ready(self, apply_result):
                     self._to_notify._set_exception()
                 else:
                     self._to_notify._set_value(lst)
-
-
-def _test(arg=None):
-    """Some tests"""
-    if arg == "-v":
-        def say(*x):
-            print(*x)
-    else:
-        def say(*x):
-            pass
-    say("Start Pool testing")
-    import time
-    
-    get_tid = lambda: threading.current_thread().ident
-
-    def return42():
-        return 42
-
-    def f(x):
-        return x * x
-
-    def work(mseconds):
-        res = str(mseconds)
-        if mseconds < 0:
-            mseconds = -mseconds
-        say("[%d] Start to work for %fms..." % (get_tid(), mseconds*10))
-        time.sleep(mseconds/100.)
-        say("[%d] Work done (%fms)." % (get_tid(), mseconds*10))
-        return res
-
-    ### Test copy/pasted from multiprocessing
-    pool = Pool(4)  # start worker threads
-
-    # edge cases
-    assert pool.map(return42, []) == []
-    assert pool.apply_async(return42, []).get() == 42
-    assert pool.apply(return42, []) == 42
-    assert list(pool.imap(return42, iter([]))) == []
-    assert list(pool.imap_unordered(return42, iter([]))) == []
-    assert pool.map_async(return42, []).get() == []
-    assert list(pool.imap_async(return42, iter([])).get()) == []
-    assert list(pool.imap_unordered_async(return42, iter([])).get()) == []
-
-    # basic tests
-    result = pool.apply_async(f, (10,))  # evaluate "f(10)" asynchronously
-    assert result.get(timeout=1) == 100  # ... unless slow computer
-    assert list(pool.map(f, range(10))) == list(map(f, range(10)))
-    it = pool.imap(f, range(10))
-    assert next(it) == 0
-    assert next(it) == 1
-    assert next(it) == 4
-
-    # Test apply_sync exceptions
-    result = pool.apply_async(time.sleep, (3,))
-    try:
-        say(result.get(timeout=1))  # raises `TimeoutError`
-    except TimeoutError:
-        say("Good. Got expected timeout exception.")
-    else:
-        assert False, "Expected exception !"
-    assert result.get() is None  # sleep() returns None
-
-    def cb(s):
-        say("Result ready: %s" % s)
-
-    # Test imap()
-    assert list(pool.imap(work, range(10, 3, -1), chunksize=4)) == list(map(
-        str, range(10, 3, -1)))
-
-    # Test imap_unordered()
-    assert sorted(pool.imap_unordered(work, range(10, 3, -1))) == sorted(map(
-        str, range(10, 3, -1)))
-
-    # Test map_async()
-    result = pool.map_async(work, range(10), callback=cb)
-    try:
-        result.get(timeout=0.01)  # raises `TimeoutError`
-    except TimeoutError:
-        say("Good. Got expected timeout exception.")
-    else:
-        assert False, "Expected exception !"
-    say(result.get())
-
-    # Test imap_async()
-    result = pool.imap_async(work, range(3, 10), callback=cb)
-    try:
-        result.get(timeout=0.01)  # raises `TimeoutError`
-    except TimeoutError:
-        say("Good. Got expected timeout exception.")
-    else:
-        assert False, "Expected exception !"
-    for i in result.get():
-        say("Item:", i)
-    say("### Loop again:")
-    for i in result.get():
-        say("Item2:", i)
-
-    # Test imap_unordered_async()
-    result = pool.imap_unordered_async(work, range(10, 3, -1), callback=cb)
-    try:
-        say(result.get(timeout=0.01))  # raises `TimeoutError`
-    except TimeoutError:
-        say("Good. Got expected timeout exception.")
-    else:
-        assert False, "Expected exception !"
-    for i in result.get():
-        say("Item1:", i)
-    for i in result.get():
-        say("Item2:", i)
-    r = result.get()
-    for i in r:
-        say("Item3:", i)
-    for i in r:
-        say("Item4:", i)
-    for i in r:
-        say("Item5:", i)
-
-    #
-    # The case for the exceptions
-    #
-
-    # Exceptions in imap_unordered_async()
-    result = pool.imap_unordered_async(work, range(2, -10, -1), callback=cb)
-    time.sleep(3)
-    try:
-        for i in result.get():
-            say("Got item:", i)
-    except (IOError, ValueError):
-        say("Good. Got expected exception")
-
-    # Exceptions in imap_async()
-    result = pool.imap_async(work, range(2, -10, -1), callback=cb)
-    time.sleep(3)
-    try:
-        for i in result.get():
-            say("Got item:", i)
-    except (IOError, ValueError):
-        say("Good. Got expected exception")
-
-    # Stop the test: need to stop the pool !!!
-    pool.terminate()
-    pool.join()
-    print("done")
-
-
-# End of david's derived file content
-
-class Monkey:
-    """
-    Context manager which replaces standard multiprocessing.pool.ThreadPool
-    implementation with TBB.Pool using monkey-patching. It also enables TBB
-    threading for Intel(R) Math Kernel Library (Intel(R) MKL). For example:
-
-        with TBB.Monkey():
-            run_my_numpy_code()
-
-    """
-    _items = {'ThreadPool': None}
-
-    def __init__(self):
-        pass
-
-    def __enter__(self):
-        import os
-        self.env = os.getenv('MKL_THREADING_LAYER')
-        os.environ['MKL_THREADING_LAYER'] = 'TBB'
-        self.module = __import__('multiprocessing.pool', globals(), locals(), self._items.keys())
-        for name in self._items.keys():
-            oldattr = getattr(self.module, name)
-            self._items[name] = oldattr
-            setattr(self.module, name, Pool)
-        return self
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        import os
-        if self.env is None:
-            del os.environ['MKL_THREADING_LAYER']
-        else:
-            os.environ['MKL_THREADING_LAYER'] = self.env
-        for name in self._items.keys():
-            setattr(self.module, name, self._items[name])
-
-
-def _main():
-    # Run the module specified as the next command line argument
-    # python -m TBB user_app.py
-    del sys.argv[0]  # shift arguments
-    if len(sys.argv) < 1:
-        print("No file name specified for execution", file=sys.stderr)
-    elif '_' + sys.argv[0] in globals():
-        globals()['_' + sys.argv[0]](*sys.argv[1:])
-    else:
-        import runpy
-        with Monkey():
-            runpy.run_path(sys.argv[0], run_name='__main__')
-
-
-if __name__ == "__main__":
-    sys.exit(_main())
diff --git a/python/tbb/test.py b/python/tbb/test.py
new file mode 100644
index 0000000000..8a8238d6d5
--- /dev/null
+++ b/python/tbb/test.py
@@ -0,0 +1,199 @@
+#!/usr/bin/env python
+#
+# Copyright (c) 2016-2017 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+#
+#
+
+# Based on the software developed by:
+# Copyright (c) 2008,2016 david decotigny (Pool of threads)
+# Copyright (c) 2006-2008, R Oudkerk (multiprocessing.Pool)
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+# 3. Neither the name of author nor the names of any contributors may be
+#    used to endorse or promote products derived from this software
+#    without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+
+from __future__ import print_function
+import time
+import threading
+
+from .api import *
+from .pool import *
+
+
+def test(arg=None):
+    if arg == "-v":
+        def say(*x):
+            print(*x)
+    else:
+        def say(*x):
+            pass
+    say("Start Pool testing")
+
+    get_tid = lambda: threading.current_thread().ident
+
+    def return42():
+        return 42
+
+    def f(x):
+        return x * x
+
+    def work(mseconds):
+        res = str(mseconds)
+        if mseconds < 0:
+            mseconds = -mseconds
+        say("[%d] Start to work for %fms..." % (get_tid(), mseconds*10))
+        time.sleep(mseconds/100.)
+        say("[%d] Work done (%fms)." % (get_tid(), mseconds*10))
+        return res
+
+    ### Test copy/pasted from multiprocessing
+    pool = Pool(4)  # start worker threads
+
+    # edge cases
+    assert pool.map(return42, []) == []
+    assert pool.apply_async(return42, []).get() == 42
+    assert pool.apply(return42, []) == 42
+    assert list(pool.imap(return42, iter([]))) == []
+    assert list(pool.imap_unordered(return42, iter([]))) == []
+    assert pool.map_async(return42, []).get() == []
+    assert list(pool.imap_async(return42, iter([])).get()) == []
+    assert list(pool.imap_unordered_async(return42, iter([])).get()) == []
+
+    # basic tests
+    result = pool.apply_async(f, (10,))  # evaluate "f(10)" asynchronously
+    assert result.get(timeout=1) == 100  # ... unless slow computer
+    assert list(pool.map(f, range(10))) == list(map(f, range(10)))
+    it = pool.imap(f, range(10))
+    assert next(it) == 0
+    assert next(it) == 1
+    assert next(it) == 4
+
+    # Test apply_sync exceptions
+    result = pool.apply_async(time.sleep, (3,))
+    try:
+        say(result.get(timeout=1))  # raises `TimeoutError`
+    except TimeoutError:
+        say("Good. Got expected timeout exception.")
+    else:
+        assert False, "Expected exception !"
+    assert result.get() is None  # sleep() returns None
+
+    def cb(s):
+        say("Result ready: %s" % s)
+
+    # Test imap()
+    assert list(pool.imap(work, range(10, 3, -1), chunksize=4)) == list(map(
+        str, range(10, 3, -1)))
+
+    # Test imap_unordered()
+    assert sorted(pool.imap_unordered(work, range(10, 3, -1))) == sorted(map(
+        str, range(10, 3, -1)))
+
+    # Test map_async()
+    result = pool.map_async(work, range(10), callback=cb)
+    try:
+        result.get(timeout=0.01)  # raises `TimeoutError`
+    except TimeoutError:
+        say("Good. Got expected timeout exception.")
+    else:
+        assert False, "Expected exception !"
+    say(result.get())
+
+    # Test imap_async()
+    result = pool.imap_async(work, range(3, 10), callback=cb)
+    try:
+        result.get(timeout=0.01)  # raises `TimeoutError`
+    except TimeoutError:
+        say("Good. Got expected timeout exception.")
+    else:
+        assert False, "Expected exception !"
+    for i in result.get():
+        say("Item:", i)
+    say("### Loop again:")
+    for i in result.get():
+        say("Item2:", i)
+
+    # Test imap_unordered_async()
+    result = pool.imap_unordered_async(work, range(10, 3, -1), callback=cb)
+    try:
+        say(result.get(timeout=0.01))  # raises `TimeoutError`
+    except TimeoutError:
+        say("Good. Got expected timeout exception.")
+    else:
+        assert False, "Expected exception !"
+    for i in result.get():
+        say("Item1:", i)
+    for i in result.get():
+        say("Item2:", i)
+    r = result.get()
+    for i in r:
+        say("Item3:", i)
+    for i in r:
+        say("Item4:", i)
+    for i in r:
+        say("Item5:", i)
+
+    #
+    # The case for the exceptions
+    #
+
+    # Exceptions in imap_unordered_async()
+    result = pool.imap_unordered_async(work, range(2, -10, -1), callback=cb)
+    time.sleep(3)
+    try:
+        for i in result.get():
+            say("Got item:", i)
+    except (IOError, ValueError):
+        say("Good. Got expected exception")
+
+    # Exceptions in imap_async()
+    result = pool.imap_async(work, range(2, -10, -1), callback=cb)
+    time.sleep(3)
+    try:
+        for i in result.get():
+            say("Got item:", i)
+    except (IOError, ValueError):
+        say("Good. Got expected exception")
+
+    # Stop the test: need to stop the pool !!!
+    pool.terminate()
+    pool.join()
+
+
diff --git a/src/Makefile b/src/Makefile
index 195304d5f1..070c634b4f 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -94,8 +94,8 @@ else
 	@$(MAKE) -C "$(work_dir)_$(cfg)"  -r -f $(tbb_root)/build/Makefile.test $@
 endif
 
-python_%: tbb_release
-	bash -c ". $(work_dir)_release$(SLASH)tbbvars.sh && $(MAKE) -rC '$(full_tbb_root)/python' CXX=$(compiler) $(subst python_,,$@)"
+python_%:
+	$(MAKE) -C "$(work_dir)_release" -rf $(tbb_root)/python/Makefile $(subst python_,,$@)
 
 .PHONY: test_release test_debug test_release_no_depends test_debug_no_depends
 .PHONY: tbb_release tbb_debug tbb_test_release tbb_test_debug tbb_test_release_no_depends tbb_test_debug_no_depends
diff --git a/src/tbb/arena.cpp b/src/tbb/arena.cpp
index abe4d35bea..70ad17583c 100644
--- a/src/tbb/arena.cpp
+++ b/src/tbb/arena.cpp
@@ -641,28 +641,27 @@ void generic_scheduler::nested_arena_entry(arena* a, size_t slot_index) {
     attach_arena( a, slot_index, /*is_master*/true );
     __TBB_ASSERT( my_arena == a, NULL );
     governor::assume_scheduler( this );
-#if __TBB_ARENA_OBSERVER
-    my_last_local_observer = 0; // TODO: try optimize number of calls
-    my_arena->my_observers.notify_entry_observers( my_last_local_observer, /*worker=*/false );
-#endif
     // TODO? ITT_NOTIFY(sync_acquired, a->my_slots + index);
     // TODO: it requires market to have P workers (not P-1)
     // TODO: a preempted worker should be excluded from assignment to other arenas e.g. my_slack--
     if( !is_worker() && slot_index >= my_arena->my_num_reserved_slots )
         my_arena->my_market->adjust_demand(*my_arena, -1);
+#if __TBB_ARENA_OBSERVER
+    my_last_local_observer = 0; // TODO: try optimize number of calls
+    my_arena->my_observers.notify_entry_observers( my_last_local_observer, /*worker=*/false );
+#endif
 }
 
 void generic_scheduler::nested_arena_exit() {
-    if( !is_worker() && my_arena_index >= my_arena->my_num_reserved_slots )
-        my_arena->my_market->adjust_demand(*my_arena, 1);
 #if __TBB_ARENA_OBSERVER
     my_arena->my_observers.notify_exit_observers( my_last_local_observer, /*worker=*/false );
 #endif /* __TBB_ARENA_OBSERVER */
-
 #if __TBB_TASK_PRIORITY
     if ( my_offloaded_tasks )
         my_arena->orphan_offloaded_tasks( *this );
 #endif
+    if( !is_worker() && my_arena_index >= my_arena->my_num_reserved_slots )
+        my_arena->my_market->adjust_demand(*my_arena, 1);
     // Free the master slot.
     __TBB_ASSERT(my_arena->my_slots[my_arena_index].my_scheduler, "A slot is already empty");
     __TBB_store_with_release(my_arena->my_slots[my_arena_index].my_scheduler, (generic_scheduler*)NULL);
diff --git a/src/tbb/concurrent_vector.cpp b/src/tbb/concurrent_vector.cpp
index 4ecf09903a..bf405cf61b 100644
--- a/src/tbb/concurrent_vector.cpp
+++ b/src/tbb/concurrent_vector.cpp
@@ -43,7 +43,7 @@ using namespace std;
 namespace tbb {
 
 namespace internal {
-    class concurrent_vector_base_v3::helper :no_assign {
+class concurrent_vector_base_v3::helper :no_assign {
 public:
     //! memory page size
     static const size_type page_size = 4096;
@@ -197,7 +197,7 @@ namespace internal {
                 func( begin, n );
         }
     };
-};
+}; // class helper
 
 void concurrent_vector_base_v3::helper::extend_segment_table(concurrent_vector_base_v3 &v, concurrent_vector_base_v3::size_type start) {
     if( start > segment_size(pointers_per_short_table) ) start = segment_size(pointers_per_short_table);
diff --git a/src/tbb/condition_variable.cpp b/src/tbb/condition_variable.cpp
index 49ed609f4a..cf5b0f074d 100644
--- a/src/tbb/condition_variable.cpp
+++ b/src/tbb/condition_variable.cpp
@@ -141,8 +141,18 @@ static const dynamic_link_descriptor CondVarLinkTable[] = {
 void init_condvar_module()
 {
     __TBB_ASSERT( (uintptr_t)__TBB_init_condvar==(uintptr_t)&init_condvar_using_event, NULL );
-    if( dynamic_link( "Kernel32.dll", CondVarLinkTable, 4 ) )
+#if __TBB_WIN8UI_SUPPORT
+    // We expect condition variables to be always available for Windows* store applications,
+    // so there is no need to check presense and use alternative implementation.
+    __TBB_init_condvar = (void (WINAPI *)(PCONDITION_VARIABLE))&InitializeConditionVariable;
+    __TBB_condvar_wait = (BOOL(WINAPI *)(PCONDITION_VARIABLE, LPCRITICAL_SECTION, DWORD))&SleepConditionVariableCS;
+    __TBB_condvar_notify_one = (void (WINAPI *)(PCONDITION_VARIABLE))&WakeConditionVariable;
+    __TBB_condvar_notify_all = (void (WINAPI *)(PCONDITION_VARIABLE))&WakeAllConditionVariable;
+    __TBB_destroy_condvar = (void (WINAPI *)(PCONDITION_VARIABLE))&destroy_condvar_noop;
+#else
+    if (dynamic_link("Kernel32.dll", CondVarLinkTable, 4))
         __TBB_destroy_condvar = (void (WINAPI *)(PCONDITION_VARIABLE))&destroy_condvar_noop;
+#endif
 }
 #endif /* _WIN32||_WIN64 */
 
diff --git a/src/tbb/mac32-tbb-export.lst b/src/tbb/mac32-tbb-export.lst
index 8ce79f2dee..07d274fc5b 100644
--- a/src/tbb/mac32-tbb-export.lst
+++ b/src/tbb/mac32-tbb-export.lst
@@ -191,6 +191,15 @@ __TBB_SYMBOL( _ZN3tbb8internal33itt_store_pointer_with_release_v3EPvS1_ )
 __TBB_SYMBOL( _ZN3tbb8internal18call_itt_notify_v5EiPv )
 __TBB_SYMBOL( _ZN3tbb8internal19itt_load_pointer_v3EPKv )
 __TBB_SYMBOL( _ZN3tbb8internal20itt_set_sync_name_v3EPvPKc )
+#if __TBB_ITT_STRUCTURE_API
+__TBB_SYMBOL( _ZN3tbb8internal22itt_make_task_group_v7ENS0_15itt_domain_enumEPvyS2_yNS0_12string_indexE )
+__TBB_SYMBOL( _ZN3tbb8internal23itt_metadata_str_add_v7ENS0_15itt_domain_enumEPvyNS0_12string_indexEPKc )
+__TBB_SYMBOL( _ZN3tbb8internal19itt_relation_add_v7ENS0_15itt_domain_enumEPvyNS0_12itt_relationES2_y )
+__TBB_SYMBOL( _ZN3tbb8internal17itt_task_begin_v7ENS0_15itt_domain_enumEPvyS2_yNS0_12string_indexE )
+__TBB_SYMBOL( _ZN3tbb8internal15itt_task_end_v7ENS0_15itt_domain_enumE )
+__TBB_SYMBOL( _ZN3tbb8internal19itt_region_begin_v9ENS0_15itt_domain_enumEPvyS2_yNS0_12string_indexE )
+__TBB_SYMBOL( _ZN3tbb8internal17itt_region_end_v9ENS0_15itt_domain_enumEPvy )
+#endif
 
 // pipeline.cpp
 __TBB_SYMBOL( _ZTIN3tbb6filterE )
diff --git a/src/tbb/mac64-tbb-export.lst b/src/tbb/mac64-tbb-export.lst
index 67d3676d00..d410e69bd6 100644
--- a/src/tbb/mac64-tbb-export.lst
+++ b/src/tbb/mac64-tbb-export.lst
@@ -188,6 +188,15 @@ __TBB_SYMBOL( _ZN3tbb8internal33itt_store_pointer_with_release_v3EPvS1_ )
 __TBB_SYMBOL( _ZN3tbb8internal18call_itt_notify_v5EiPv )
 __TBB_SYMBOL( _ZN3tbb8internal19itt_load_pointer_v3EPKv )
 __TBB_SYMBOL( _ZN3tbb8internal20itt_set_sync_name_v3EPvPKc )
+#if __TBB_ITT_STRUCTURE_API
+__TBB_SYMBOL( _ZN3tbb8internal23itt_metadata_str_add_v7ENS0_15itt_domain_enumEPvyNS0_12string_indexEPKc )
+__TBB_SYMBOL( _ZN3tbb8internal22itt_make_task_group_v7ENS0_15itt_domain_enumEPvyS2_yNS0_12string_indexE )
+__TBB_SYMBOL( _ZN3tbb8internal17itt_task_begin_v7ENS0_15itt_domain_enumEPvyS2_yNS0_12string_indexE )
+__TBB_SYMBOL( _ZN3tbb8internal19itt_relation_add_v7ENS0_15itt_domain_enumEPvyNS0_12itt_relationES2_y )
+__TBB_SYMBOL( _ZN3tbb8internal15itt_task_end_v7ENS0_15itt_domain_enumE )
+__TBB_SYMBOL( _ZN3tbb8internal19itt_region_begin_v9ENS0_15itt_domain_enumEPvyS2_yNS0_12string_indexE )
+__TBB_SYMBOL( _ZN3tbb8internal17itt_region_end_v9ENS0_15itt_domain_enumEPvy )
+#endif
 
 // pipeline.cpp
 __TBB_SYMBOL( _ZTIN3tbb6filterE )
diff --git a/src/tbb/market.cpp b/src/tbb/market.cpp
index 3b6efc630e..9fb6936e24 100644
--- a/src/tbb/market.cpp
+++ b/src/tbb/market.cpp
@@ -329,7 +329,7 @@ void market::try_destroy_arena ( arena* a, uintptr_t aba_epoch ) {
     assert_market_valid();
 #if __TBB_TASK_PRIORITY
     // scan all priority levels, not only in [my_global_bottom_priority;my_global_top_priority]
-    // range, because arena to be destoyed can have no outstanding request for workers
+    // range, because arena to be destroyed can have no outstanding request for workers
     for ( int p = num_priority_levels-1; p >= 0; --p ) {
         priority_level_info &pl = my_priority_levels[p];
         arena_list_type &my_arenas = pl.arenas;
diff --git a/src/tbb/tbb_main.cpp b/src/tbb/tbb_main.cpp
index 685d28ec3d..a1fd8d964d 100644
--- a/src/tbb/tbb_main.cpp
+++ b/src/tbb/tbb_main.cpp
@@ -241,12 +241,16 @@ void DoOneTimeInitializations() {
 
 #if (_WIN32||_WIN64) && !__TBB_SOURCE_DIRECTLY_INCLUDED
 //! Windows "DllMain" that handles startup and shutdown of dynamic library.
-extern "C" bool WINAPI DllMain( HANDLE /*hinstDLL*/, DWORD reason, LPVOID /*lpvReserved*/ ) {
+extern "C" bool WINAPI DllMain( HANDLE /*hinstDLL*/, DWORD reason, LPVOID lpvReserved ) {
     switch( reason ) {
         case DLL_PROCESS_ATTACH:
             __TBB_InitOnce::add_ref();
             break;
         case DLL_PROCESS_DETACH:
+            // Since THREAD_DETACH is not called for the main thread, call auto-termination
+            // here as well - but not during process shutdown (due to risk of a deadlock).
+            if( lpvReserved==NULL ) // library unload
+                governor::terminate_auto_initialized_scheduler();
             __TBB_InitOnce::remove_ref();
             // It is assumed that InitializationDone is not set after DLL_PROCESS_DETACH,
             // and thus no race on InitializationDone is possible.
diff --git a/src/tbb/tbb_main.h b/src/tbb/tbb_main.h
index 24d77d1128..b675fe0703 100644
--- a/src/tbb/tbb_main.h
+++ b/src/tbb/tbb_main.h
@@ -22,6 +22,7 @@
 #define _TBB_tbb_main_H
 
 #include "tbb/atomic.h"
+#include "governor.h"
 
 namespace tbb {
 
@@ -73,6 +74,7 @@ class __TBB_InitOnce {
     //! Remove the initial reference to resources.
     /** This is not necessarily the last reference if other threads are still running. **/
     ~__TBB_InitOnce() {
+        governor::terminate_auto_initialized_scheduler(); // TLS dtor not called for the main thread
         remove_ref();
         // We assume that InitializationDone is not set after file-scope destructors
         // start running, and thus no race on InitializationDone is possible.
diff --git a/src/tbbmalloc/proxy.cpp b/src/tbbmalloc/proxy.cpp
index 5ef279da34..1c8e6657ea 100644
--- a/src/tbbmalloc/proxy.cpp
+++ b/src/tbbmalloc/proxy.cpp
@@ -337,13 +337,23 @@ void* __TBB_malloc_safer__aligned_realloc_##CRTLIB( void *ptr, size_t size, size
     return __TBB_malloc_safer_aligned_realloc( ptr, size, aligment, &func_ptrs );                    \
 }
 
-// Limit is 30 bytes/60 symbols per line, * can be used to match any digit in bytecodes.
-// Purpose of the pattern is to mark an instruction bound, it should consist of several
-// full instructions plus one more byte. It's not required for the patterns to be unique
-// (i.e., it's OK to have same pattern for unrelated functions).
+// Only for ucrtbase: substitution for _o_free
+void (*orig__o_free)(void*);
+void __TBB_malloc__o_free(void *ptr)
+{
+    __TBB_malloc_safer_free( ptr, orig__o_free );
+}
+
+// Size limit is MAX_PATTERN_SIZE (28) byte codes / 56 symbols per line.
+// * can be used to match any digit in byte codes.
+// # followed by several * indicate a relative address that needs to be corrected.
+// Purpose of the pattern is to mark an instruction bound; it should consist of several
+// full instructions plus one extra byte code. It's not required for the patterns
+// to be unique (i.e., it's OK to have same pattern for unrelated functions).
 // TODO: use hot patch prologues if exist
 const char* known_bytecodes[] = {
 #if _WIN64
+//  "========================================================" - 56 symbols
     "4883EC284885C974",       // release free()
     "4883EC284885C975",       // release _msize()
     "4885C974375348",         // release free() 8.0.50727.42, 10.0
@@ -354,12 +364,14 @@ const char* known_bytecodes[] = {
     "48894C24084883EC28BA",   // debug prologue
     "4C894424184889542410",   // debug _aligned_msize() 10.0
     "48894C24084883EC2848",   // debug _aligned_free 10.0
+    "488BD1488D0D#*******E9", // _o_free(), ucrtbase.dll
  #if __TBB_OVERLOAD_OLD_MSVCR
     "48895C2408574883EC3049", // release _aligned_msize 9.0
     "4883EC384885C975",       // release _msize() 9.0
     "4C8BC1488B0DA6E4040033", // an old win64 SDK
  #endif
 #else // _WIN32
+//  "========================================================" - 56 symbols
     "8BFF558BEC8B",           // multiple
     "8BFF558BEC83",           // release free() & _msize() 10.0.40219.325, _msize() ucrtbase.dll
     "8BFF558BECFF",           // release _aligned_msize ucrtbase.dll
@@ -641,8 +653,13 @@ void doMallocReplacement()
         {
             ReplaceFunctionWithStore( modules_to_replace[j].name, c_routines_to_replace[i]._func, c_routines_to_replace[i]._fptr, NULL, NULL,  c_routines_to_replace[i]._on_error );
         }
-        // ucrtbase.dll does not export operator new/delete.
-        if ( strcmp(modules_to_replace[j].name, "ucrtbase.dll") == 0 ){
+        if ( strcmp(modules_to_replace[j].name, "ucrtbase.dll") == 0 ) {
+            // If _o_free function is present and patchable, redirect it to tbbmalloc as well
+            // This prevents issues with other _o_* functions which might allocate memory with malloc
+            if ( IsPrologueKnown(GetModuleHandle("ucrtbase.dll"), "_o_free", known_bytecodes) ) {
+                ReplaceFunctionWithStore( "ucrtbase.dll", "_o_free", (FUNCPTR)__TBB_malloc__o_free, known_bytecodes, (FUNCPTR*)&orig__o_free,  FRR_FAIL );
+            }
+            // ucrtbase.dll does not export operator new/delete, so skip the rest of the loop.
             continue;
         }
 
diff --git a/src/tbbmalloc/tbb_function_replacement.cpp b/src/tbbmalloc/tbb_function_replacement.cpp
index b0157856c1..f93483286e 100644
--- a/src/tbbmalloc/tbb_function_replacement.cpp
+++ b/src/tbbmalloc/tbb_function_replacement.cpp
@@ -28,9 +28,9 @@
 #include <windows.h>
 #include <new>
 #include <stdio.h>
+#include <string.h>
 #include "tbb_function_replacement.h"
 
-#include "tbb/tbb_config.h"
 #include "tbb/tbb_stddef.h"
 #include "../tbb/tbb_assert_impl.h"
 
@@ -66,7 +66,7 @@ inline bool IsInDistance(UINT_PTR addr1, UINT_PTR addr2, __int64 dist)
  * doesn't allocate memory dynamically.
  *
  * The struct MemoryBuffer holds the data about a page in the memory used for
- * replacing functions in Intel64 where the target is too far to be replaced
+ * replacing functions in 64-bit code where the target is too far to be replaced
  * with a short jump. All the calculations of m_base and m_next are in a multiple
  * of SIZE_OF_ADDRESS (which is 8 in Win64).
  */
@@ -180,14 +180,13 @@ static MemoryProvider memProvider;
 
 // Compare opcodes from dictionary (str1) and opcodes from code (str2)
 // str1 might contain '*' to mask addresses
-// RETURN: NULL if opcodes did not match, string length of str1 on success
+// RETURN: 0 if opcodes did not match, 1 on success
 size_t compareStrings( const char *str1, const char *str2 )
 {
-   size_t str1Length = strlen(str1);
-   for (size_t i=0; i<str1Length; i++){
-       if( str1[i] != '*' && str1[i] != str2[i] ) return 0;
+   for (size_t i=0; str1[i]!=0; i++){
+       if( str1[i]!='*' && str1[i]!='#' && str1[i]!=str2[i] ) return 0;
    }
-   return str1Length;
+   return 1;
 }
 
 // Check function prologue with known prologues from the dictionary
@@ -195,13 +194,13 @@ size_t compareStrings( const char *str1, const char *str2 )
 // inpAddr - pointer to function prologue
 // Dictionary contains opcodes for several full asm instructions
 // + one opcode byte for the next asm instruction for safe address processing
-// RETURN: number of bytes for safe bytes replacement (matched_pattern/2-1)
-UINT CheckOpcodes( const char ** opcodes, void *inpAddr, bool abortOnError )
+// RETURN: 1 + the index of the matched pattern, or 0 if no match found.
+static UINT CheckOpcodes( const char ** opcodes, void *inpAddr, bool abortOnError )
 {
     static size_t opcodesStringsCount = 0;
     static size_t maxOpcodesLength = 0;
     static size_t opcodes_pointer = (size_t)opcodes;
-    char opcodeString[61];
+    char opcodeString[2*MAX_PATTERN_SIZE+1];
     size_t i;
     size_t result;
 
@@ -214,20 +213,20 @@ UINT CheckOpcodes( const char ** opcodes, void *inpAddr, bool abortOnError )
             opcodesStringsCount++;
         }
         opcodes_pointer = (size_t)opcodes;
-        __TBB_ASSERT( maxOpcodesLength < 61, "Limit is 30 opcodes/60 symbols per pattern" );
+        __TBB_ASSERT( maxOpcodesLength/2 <= MAX_PATTERN_SIZE, "Pattern exceeded the limit of 28 opcodes/56 symbols" );
     }
 
     // Translate prologue opcodes to string format to compare
-    for( i=0; i< maxOpcodesLength/2; i++ ){
+    for( i=0; i<maxOpcodesLength/2 && i<MAX_PATTERN_SIZE; ++i ){
         sprintf( opcodeString + 2*i, "%.2X", *((unsigned char*)inpAddr+i) );
     }
-    opcodeString[maxOpcodesLength] = 0;
+    opcodeString[2*i] = 0;
 
     // Compare translated opcodes with patterns
-    for( i=0; i< opcodesStringsCount; i++ ){
-        result = compareStrings( opcodes[i],opcodeString );
+    for( UINT idx=0; idx<opcodesStringsCount; ++idx ){
+        result = compareStrings( opcodes[idx],opcodeString );
         if( result )
-            return (UINT)(result/2-1);
+            return idx+1; // avoid 0 which indicates a failure
     }
     if (abortOnError) {
         // Impossibility to find opcodes in the dictionary is a serious issue,
@@ -237,11 +236,26 @@ UINT CheckOpcodes( const char ** opcodes, void *inpAddr, bool abortOnError )
     return 0;
 }
 
+// Modify offsets in original code after moving it to a trampoline.
+// We do not have more than one offset to correct in existing opcode patterns.
+static void CorrectOffset( UINT_PTR address, const char* pattern, UINT distance )
+{
+    const char* pos = strstr(pattern, "#*******");
+    if( pos ) {
+        address += (pos - pattern)/2; // compute the offset position
+        UINT value;
+        // UINT assignment is not used to avoid potential alignment issues
+        memcpy(&value, Addrint2Ptr(address), sizeof(value));
+        value += distance;
+        memcpy(Addrint2Ptr(address), &value, sizeof(value));
+    }
+}
+
 // Insert jump relative instruction to the input address
 // RETURN: the size of the trampoline or 0 on failure
-static DWORD InsertTrampoline32(void *inpAddr, void *targetAddr, const char ** opcodes, void** storedAddr)
+static DWORD InsertTrampoline32(void *inpAddr, void *targetAddr, const char* pattern, void** storedAddr)
 {
-    UINT opcodesNumber = SIZE_OF_RELJUMP;
+    size_t bytesToMove = SIZE_OF_RELJUMP;
     UINT_PTR srcAddr = Ptr2Addrint(inpAddr);
     UINT_PTR tgtAddr = Ptr2Addrint(targetAddr);
     // Check that the target fits in 32 bits
@@ -253,29 +267,24 @@ static DWORD InsertTrampoline32(void *inpAddr, void *targetAddr, const char ** o
     UCHAR *codePtr = (UCHAR *)inpAddr;
 
     if ( storedAddr ){ // If requested, store original function code
-        if ( *codePtr == 0xE9 ){ // JMP relative instruction
-            // For the special case when a system function consists of a single near jump,
-            // instead of moving it somewhere we use the target of the jump as the original function.
-            unsigned offsetInJmp = *(unsigned*)(codePtr + 1);
-            *storedAddr = (void*)(srcAddr + offsetInJmp + SIZE_OF_RELJUMP);
-        }else{
-            opcodesNumber = CheckOpcodes( opcodes, inpAddr, /*abortOnError=*/true );
-            __TBB_ASSERT_RELEASE( opcodesNumber >= SIZE_OF_RELJUMP, "Incorrect bytecode pattern?" );
-            UINT_PTR strdAddr = memProvider.GetLocation(srcAddr);
-            if (!strdAddr)
-                return 0;
-            *storedAddr = Addrint2Ptr(strdAddr);
-            // Set 'executable' flag for original instructions in the new place
-            DWORD pageFlags = PAGE_EXECUTE_READWRITE;
-            if (!VirtualProtect(*storedAddr, MAX_PROBE_SIZE, pageFlags, &pageFlags)) return 0;
-            // Copy original instructions to the new place
-            memcpy(*storedAddr, codePtr, opcodesNumber);
-            // Set jump to the code after replacement
-            offset = srcAddr - strdAddr - SIZE_OF_RELJUMP;
-            offset32 = (UINT)((offset & 0xFFFFFFFF));
-            *((UCHAR*)*storedAddr+opcodesNumber) = 0xE9;
-            memcpy(((UCHAR*)*storedAddr+opcodesNumber+1), &offset32, sizeof(offset32));
-        }
+        bytesToMove = strlen(pattern)/2-1; // The last byte matching the pattern must not be copied
+        __TBB_ASSERT_RELEASE( bytesToMove >= SIZE_OF_RELJUMP, "Incorrect bytecode pattern?" );
+        UINT_PTR trampAddr = memProvider.GetLocation(srcAddr);
+        if (!trampAddr)
+            return 0;
+        *storedAddr = Addrint2Ptr(trampAddr);
+        // Set 'executable' flag for original instructions in the new place
+        DWORD pageFlags = PAGE_EXECUTE_READWRITE;
+        if (!VirtualProtect(*storedAddr, MAX_PROBE_SIZE, pageFlags, &pageFlags)) return 0;
+        // Copy original instructions to the new place
+        memcpy(*storedAddr, codePtr, bytesToMove);
+        offset = srcAddr - trampAddr;
+        offset32 = (UINT)(offset & 0xFFFFFFFF);
+        CorrectOffset( trampAddr, pattern, offset32 );
+        // Set jump to the code after replacement
+        offset32 -= SIZE_OF_RELJUMP;
+        *(UCHAR*)(trampAddr+bytesToMove) = 0xE9;
+        memcpy((UCHAR*)(trampAddr+bytesToMove+1), &offset32, sizeof(offset32));
     }
 
     // The following will work correctly even if srcAddr>tgtAddr, as long as
@@ -287,7 +296,7 @@ static DWORD InsertTrampoline32(void *inpAddr, void *targetAddr, const char ** o
     memcpy(codePtr+1, &offset32, sizeof(offset32));
 
     // Fill the rest with NOPs to correctly see disassembler of old code in debugger.
-    for( unsigned i=SIZE_OF_RELJUMP; i<opcodesNumber; i++ ){
+    for( unsigned i=SIZE_OF_RELJUMP; i<bytesToMove; i++ ){
         *(codePtr+i) = 0x90;
     }
 
@@ -299,9 +308,9 @@ static DWORD InsertTrampoline32(void *inpAddr, void *targetAddr, const char ** o
 // 2  Put jump RIP relative indirect through the address in the close page
 // 3  Put the absolute address of the target in the allocated location
 // RETURN: the size of the trampoline or 0 on failure
-static DWORD InsertTrampoline64(void *inpAddr, void *targetAddr, const char ** opcodes, void** storedAddr)
+static DWORD InsertTrampoline64(void *inpAddr, void *targetAddr, const char* pattern, void** storedAddr)
 {
-    UINT opcodesNumber = SIZE_OF_INDJUMP;
+    size_t bytesToMove = SIZE_OF_INDJUMP;
 
     UINT_PTR srcAddr = Ptr2Addrint(inpAddr);
     UINT_PTR tgtAddr = Ptr2Addrint(targetAddr);
@@ -320,29 +329,24 @@ static DWORD InsertTrampoline64(void *inpAddr, void *targetAddr, const char ** o
     *locPtr = tgtAddr;
 
     if ( storedAddr ){ // If requested, store original function code
-        if ( *codePtr == 0xE9 ){ // JMP relative instruction
-            // For the special case when a system function consists of a single near jump,
-            // instead of moving it somewhere we use the target of the jump as the original function.
-            unsigned offsetInJmp = *(unsigned*)(codePtr + 1);
-            *storedAddr = (void*)(srcAddr + offsetInJmp + SIZE_OF_RELJUMP);
-        }else{
-            opcodesNumber = CheckOpcodes( opcodes, inpAddr, /*abortOnError=*/true );
-            __TBB_ASSERT_RELEASE( opcodesNumber >= SIZE_OF_INDJUMP, "Incorrect bytecode pattern?" );
-            UINT_PTR strdAddr = memProvider.GetLocation(srcAddr);
-            if (!strdAddr)
-                return 0;
-            *storedAddr = Addrint2Ptr(strdAddr);
-            // Set 'executable' flag for original instructions in the new place
-            DWORD pageFlags = PAGE_EXECUTE_READWRITE;
-            if (!VirtualProtect(*storedAddr, MAX_PROBE_SIZE, pageFlags, &pageFlags)) return 0;
-            // Copy original instructions to the new place
-            memcpy(*storedAddr, codePtr, opcodesNumber);
-            // Set jump to the code after replacement. It is within the distance of relative jump!
-            offset = srcAddr - strdAddr - SIZE_OF_RELJUMP;
-            offset32 = (UINT)((offset & 0xFFFFFFFF));
-            *((UCHAR*)*storedAddr+opcodesNumber) = 0xE9;
-            memcpy(((UCHAR*)*storedAddr+opcodesNumber+1), &offset32, sizeof(offset32));
-        }
+        bytesToMove = strlen(pattern)/2-1; // The last byte matching the pattern must not be copied
+        __TBB_ASSERT_RELEASE( bytesToMove >= SIZE_OF_INDJUMP, "Incorrect bytecode pattern?" );
+        UINT_PTR trampAddr = memProvider.GetLocation(srcAddr);
+        if (!trampAddr)
+            return 0;
+        *storedAddr = Addrint2Ptr(trampAddr);
+        // Set 'executable' flag for original instructions in the new place
+        DWORD pageFlags = PAGE_EXECUTE_READWRITE;
+        if (!VirtualProtect(*storedAddr, MAX_PROBE_SIZE, pageFlags, &pageFlags)) return 0;
+        // Copy original instructions to the new place
+        memcpy(*storedAddr, codePtr, bytesToMove);
+        offset = srcAddr - trampAddr;
+        offset32 = (UINT)(offset & 0xFFFFFFFF);
+        CorrectOffset( trampAddr, pattern, offset32 );
+        // Set jump to the code after replacement. It is within the distance of relative jump!
+        offset32 -= SIZE_OF_RELJUMP;
+        *(UCHAR*)(trampAddr+bytesToMove) = 0xE9;
+        memcpy((UCHAR*)(trampAddr+bytesToMove+1), &offset32, sizeof(offset32));
     }
 
     // Fill the buffer
@@ -353,7 +357,7 @@ static DWORD InsertTrampoline64(void *inpAddr, void *targetAddr, const char ** o
     memcpy(codePtr+2, &offset32, sizeof(offset32));
 
     // Fill the rest with NOPs to correctly see disassembler of old code in debugger.
-    for( unsigned i=SIZE_OF_INDJUMP; i<opcodesNumber; i++ ){
+    for( unsigned i=SIZE_OF_INDJUMP; i<bytesToMove; i++ ){
         *(codePtr+i) = 0x90;
     }
 
@@ -373,9 +377,27 @@ static bool InsertTrampoline(void *inpAddr, void *targetAddr, const char ** opco
     DWORD origProt = 0;
     if (!VirtualProtect(inpAddr, MAX_PROBE_SIZE, PAGE_EXECUTE_WRITECOPY, &origProt))
         return FALSE;
-    probeSize = InsertTrampoline32(inpAddr, targetAddr, opcodes, origFunc);
+
+    UINT opcodeIdx = 0;
+    if ( origFunc ){ // Need to store original function code
+        UCHAR * const codePtr = (UCHAR *)inpAddr;
+        if ( *codePtr == 0xE9 ){ // JMP relative instruction
+            // For the special case when a system function consists of a single near jump,
+            // instead of moving it somewhere we use the target of the jump as the original function.
+            unsigned offsetInJmp = *(unsigned*)(codePtr + 1);
+            *origFunc = (void*)(Ptr2Addrint(inpAddr) + offsetInJmp + SIZE_OF_RELJUMP);
+            origFunc = NULL; // now it must be ignored by InsertTrampoline32/64
+        } else {
+            // find the right opcode pattern
+            opcodeIdx = CheckOpcodes( opcodes, inpAddr, /*abortOnError=*/true );
+            __TBB_ASSERT( opcodeIdx > 0, "abortOnError ignored in CheckOpcodes?" );
+        }
+    }
+
+    const char* pattern = opcodeIdx>0? opcodes[opcodeIdx-1]: NULL; // -1 compensates for +1 in CheckOpcodes
+    probeSize = InsertTrampoline32(inpAddr, targetAddr, pattern, origFunc);
     if (!probeSize)
-        probeSize = InsertTrampoline64(inpAddr, targetAddr, opcodes, origFunc);
+        probeSize = InsertTrampoline64(inpAddr, targetAddr, pattern, origFunc);
 
     // Restore original protection
     VirtualProtect(inpAddr, MAX_PROBE_SIZE, origProt, &origProt);
diff --git a/src/tbbmalloc/tbb_function_replacement.h b/src/tbbmalloc/tbb_function_replacement.h
index e986ab18b2..3595667004 100644
--- a/src/tbbmalloc/tbb_function_replacement.h
+++ b/src/tbbmalloc/tbb_function_replacement.h
@@ -56,7 +56,7 @@ union Int2Ptr {
 inline UINT_PTR Ptr2Addrint(LPVOID ptr);
 inline LPVOID Addrint2Ptr(UINT_PTR ptr);
 
-// Use this value as the maximum size the trampoline region
+// The size of a trampoline region
 const unsigned MAX_PROBE_SIZE = 32;
 
 // The size of a jump relative instruction "e9 00 00 00 00"
@@ -68,6 +68,10 @@ const unsigned SIZE_OF_INDJUMP = 6;
 // The size of address we put in the location (in Intel64)
 const unsigned SIZE_OF_ADDRESS = 8;
 
+// The size limit (in bytes) for an opcode pattern to fit into a trampoline
+// There should be enough space left for a relative jump; +1 is for the extra pattern byte.
+const unsigned MAX_PATTERN_SIZE = MAX_PROBE_SIZE - SIZE_OF_RELJUMP + 1;
+
 // The max distance covered in 32 bits: 2^31 - 1 - C
 // where C should not be smaller than the size of a probe.
 // The latter is important to correctly handle "backward" jumps.
diff --git a/src/test/harness_allocator.h b/src/test/harness_allocator.h
index b0ecc6196c..b013ccdc97 100644
--- a/src/test/harness_allocator.h
+++ b/src/test/harness_allocator.h
@@ -127,7 +127,7 @@ struct arena {
     //! Allocate space for n objects, starting on a cache/sector line.
     pointer allocate( size_type n, const void* =0) {
         size_t new_size = (my_data->my_allocated += n*sizeof(T));
-        __TBB_ASSERT(my_data->my_allocated <= my_data->my_size,"trying to allocate more than was reserved");
+        ASSERT(my_data->my_allocated <= my_data->my_size,"trying to allocate more than was reserved");
         char* result =  &(my_data->my_buffer[new_size - n*sizeof(T)]);
         return reinterpret_cast<pointer>(result);
     }
@@ -135,8 +135,8 @@ struct arena {
     //! Free block of memory that starts on a cache line
     void deallocate( pointer p_arg, size_type n) {
         char* p = reinterpret_cast<char*>(p_arg);
-        __TBB_ASSERT(p >=my_data->my_buffer && p <= my_data->my_buffer + my_data->my_size, "trying to deallocate pointer not from arena ?");
-        __TBB_ASSERT(p + n*sizeof(T) <= my_data->my_buffer + my_data->my_size, "trying to deallocate incorrect number of items?");
+        ASSERT(p >=my_data->my_buffer && p <= my_data->my_buffer + my_data->my_size, "trying to deallocate pointer not from arena ?");
+        ASSERT(p + n*sizeof(T) <= my_data->my_buffer + my_data->my_size, "trying to deallocate incorrect number of items?");
         tbb::internal::suppress_unused_warning(p, n);
     }
 
diff --git a/src/test/harness_defs.h b/src/test/harness_defs.h
index 26bd9055f3..369c70ac75 100644
--- a/src/test/harness_defs.h
+++ b/src/test/harness_defs.h
@@ -175,10 +175,18 @@
     #endif
 #endif
 
-#ifndef TBB_PREVIEW_FLOW_GRAPH_FEATURES
-    #if __TBB_CPF_BUILD
+#if __TBB_CPF_BUILD
+    #ifndef  TBB_PREVIEW_FLOW_GRAPH_FEATURES
         #define TBB_PREVIEW_FLOW_GRAPH_FEATURES 1
     #endif
+    #if __TBB_ITT_STRUCTURE_API
+        #ifndef TBB_PREVIEW_FLOW_GRAPH_TRACE
+            #define TBB_PREVIEW_FLOW_GRAPH_TRACE 1
+        #endif
+        #ifndef TBB_PREVIEW_ALGORITHM_TRACE
+            #define TBB_PREVIEW_ALGORITHM_TRACE 1
+        #endif
+    #endif
 #endif
 
 // std::is_copy_constructible<T>::value returns 'true' for non copyable type when MSVC compiler is used.
diff --git a/src/test/test_blocked_range2d.cpp b/src/test/test_blocked_range2d.cpp
index b2a0ae97fd..1ebc52edac 100644
--- a/src/test/test_blocked_range2d.cpp
+++ b/src/test/test_blocked_range2d.cpp
@@ -68,25 +68,25 @@ static void SerialTest() {
     typedef AbstractValueType<RowTag> row_type;
     typedef AbstractValueType<ColTag> col_type;
     typedef tbb::blocked_range2d<row_type,col_type> range_type;
-    for( int rowx=-10; rowx<10; ++rowx ) {
-        for( int rowy=rowx; rowy<10; ++rowy ) {
-            row_type rowi = MakeAbstractValueType<RowTag>(rowx);
-            row_type rowj = MakeAbstractValueType<RowTag>(rowy);
-            for( int rowg=1; rowg<10; ++rowg ) {
-                for( int colx=-10; colx<10; ++colx ) {
-                    for( int coly=colx; coly<10; ++coly ) {
-                        col_type coli = MakeAbstractValueType<ColTag>(colx);
-                        col_type colj = MakeAbstractValueType<ColTag>(coly);
-                        for( int colg=1; colg<10; ++colg ) {
-                            range_type r( rowi, rowj, rowg, coli, colj, colg );
+    for( int row_x=-10; row_x<10; ++row_x ) {
+        for( int row_y=row_x; row_y<10; ++row_y ) {
+            row_type row_i = MakeAbstractValueType<RowTag>(row_x);
+            row_type row_j = MakeAbstractValueType<RowTag>(row_y);
+            for( int row_grain=1; row_grain<10; ++row_grain ) {
+                for( int col_x=-10; col_x<10; ++col_x ) {
+                    for( int col_y=col_x; col_y<10; ++col_y ) {
+                        col_type col_i = MakeAbstractValueType<ColTag>(col_x);
+                        col_type col_j = MakeAbstractValueType<ColTag>(col_y);
+                        for( int col_grain=1; col_grain<10; ++col_grain ) {
+                            range_type r( row_i, row_j, row_grain, col_i, col_j, col_grain );
                             AssertSameType( r.is_divisible(), true );
                             AssertSameType( r.empty(), true );
                             AssertSameType( static_cast<range_type::row_range_type::const_iterator*>(0), static_cast<row_type*>(0) );
                             AssertSameType( static_cast<range_type::col_range_type::const_iterator*>(0), static_cast<col_type*>(0) );
-                            AssertSameType( r.rows(), tbb::blocked_range<row_type>( rowi, rowj, 1 ));
-                            AssertSameType( r.cols(), tbb::blocked_range<col_type>( coli, colj, 1 ));
-                            ASSERT( r.empty()==(rowx==rowy||colx==coly), NULL );
-                            ASSERT( r.is_divisible()==(rowy-rowx>rowg||coly-colx>colg), NULL );
+                            AssertSameType( r.rows(), tbb::blocked_range<row_type>( row_i, row_j, 1 ));
+                            AssertSameType( r.cols(), tbb::blocked_range<col_type>( col_i, col_j, 1 ));
+                            ASSERT( r.empty()==(row_x==row_y||col_x==col_y), NULL );
+                            ASSERT( r.is_divisible()==(row_y-row_x>row_grain||col_y-col_x>col_grain), NULL );
                             if( r.is_divisible() ) {
                                 range_type r2(r,tbb::split());
                                 if( GetValueOf(r2.rows().begin())==GetValueOf(r.rows().begin()) ) {
diff --git a/src/test/test_blocked_range3d.cpp b/src/test/test_blocked_range3d.cpp
index aea20223f9..242fcfd97e 100644
--- a/src/test/test_blocked_range3d.cpp
+++ b/src/test/test_blocked_range3d.cpp
@@ -70,22 +70,22 @@ static void SerialTest() {
     typedef AbstractValueType<RowTag> row_type;
     typedef AbstractValueType<ColTag> col_type;
     typedef tbb::blocked_range3d<page_type,row_type,col_type> range_type;
-    for( int pagex=-4; pagex<4; ++pagex ) {
-        for( int pagey=pagex; pagey<4; ++pagey ) {
-            page_type pagei = MakeAbstractValueType<PageTag>(pagex);
-            page_type pagej = MakeAbstractValueType<PageTag>(pagey);
-            for( int pageg=1; pageg<4; ++pageg ) {
-                for( int rowx=-4; rowx<4; ++rowx ) {
-                    for( int rowy=rowx; rowy<4; ++rowy ) {
-                        row_type rowi = MakeAbstractValueType<RowTag>(rowx);
-                        row_type rowj = MakeAbstractValueType<RowTag>(rowy);
-                        for( int rowg=1; rowg<4; ++rowg ) {
-                            for( int colx=-4; colx<4; ++colx ) {
-                                for( int coly=colx; coly<4; ++coly ) {
-                                    col_type coli = MakeAbstractValueType<ColTag>(colx);
-                                    col_type colj = MakeAbstractValueType<ColTag>(coly);
-                                    for( int colg=1; colg<4; ++colg ) {
-                                        range_type r( pagei, pagej, pageg, rowi, rowj, rowg, coli, colj, colg );
+    for( int page_x=-4; page_x<4; ++page_x ) {
+        for( int page_y=page_x; page_y<4; ++page_y ) {
+            page_type page_i = MakeAbstractValueType<PageTag>(page_x);
+            page_type page_j = MakeAbstractValueType<PageTag>(page_y);
+            for( int page_grain=1; page_grain<4; ++page_grain ) {
+                for( int row_x=-4; row_x<4; ++row_x ) {
+                    for( int row_y=row_x; row_y<4; ++row_y ) {
+                        row_type row_i = MakeAbstractValueType<RowTag>(row_x);
+                        row_type row_j = MakeAbstractValueType<RowTag>(row_y);
+                        for( int row_grain=1; row_grain<4; ++row_grain ) {
+                            for( int col_x=-4; col_x<4; ++col_x ) {
+                                for( int col_y=col_x; col_y<4; ++col_y ) {
+                                    col_type col_i = MakeAbstractValueType<ColTag>(col_x);
+                                    col_type col_j = MakeAbstractValueType<ColTag>(col_y);
+                                    for( int col_grain=1; col_grain<4; ++col_grain ) {
+                                        range_type r( page_i, page_j, page_grain, row_i, row_j, row_grain, col_i, col_j, col_grain );
                                         AssertSameType( r.is_divisible(), true );
 
                                         AssertSameType( r.empty(), true );
@@ -94,13 +94,13 @@ static void SerialTest() {
                                         AssertSameType( static_cast<range_type::row_range_type::const_iterator*>(0), static_cast<row_type*>(0) );
                                         AssertSameType( static_cast<range_type::col_range_type::const_iterator*>(0), static_cast<col_type*>(0) );
 
-                                        AssertSameType( r.pages(), tbb::blocked_range<page_type>( pagei, pagej, 1 ));
-                                        AssertSameType( r.rows(), tbb::blocked_range<row_type>( rowi, rowj, 1 ));
-                                        AssertSameType( r.cols(), tbb::blocked_range<col_type>( coli, colj, 1 ));
+                                        AssertSameType( r.pages(), tbb::blocked_range<page_type>( page_i, page_j, 1 ));
+                                        AssertSameType( r.rows(), tbb::blocked_range<row_type>( row_i, row_j, 1 ));
+                                        AssertSameType( r.cols(), tbb::blocked_range<col_type>( col_i, col_j, 1 ));
 
-                                        ASSERT( r.empty()==(pagex==pagey||rowx==rowy||colx==coly), NULL );
+                                        ASSERT( r.empty()==(page_x==page_y||row_x==row_y||col_x==col_y), NULL );
 
-                                        ASSERT( r.is_divisible()==(pagey-pagex>pageg||rowy-rowx>rowg||coly-colx>colg), NULL );
+                                        ASSERT( r.is_divisible()==(page_y-page_x>page_grain||row_y-row_x>row_grain||col_y-col_x>col_grain), NULL );
 
                                         if( r.is_divisible() ) {
                                             range_type r2(r,tbb::split());
diff --git a/src/test/test_concurrent_hash_map.cpp b/src/test/test_concurrent_hash_map.cpp
index 03c281af94..a4a376b638 100644
--- a/src/test/test_concurrent_hash_map.cpp
+++ b/src/test/test_concurrent_hash_map.cpp
@@ -208,7 +208,6 @@ typedef local_counting_allocator<std::allocator<MyData> > MyAllocator;
 typedef tbb::concurrent_hash_map<MyKey,MyData,MyHashCompare,MyAllocator> MyTable;
 typedef tbb::concurrent_hash_map<MyKey,MyData2,MyHashCompare> MyTable2;
 typedef tbb::concurrent_hash_map<MyKey,MyData,YourHashCompare> YourTable;
-typedef tbb::concurrent_hash_map<MyKey,MyData,MyHashCompare,MyAllocator> MyTable;
 
 template<typename MyTable>
 inline void CheckAllocator(MyTable &table, size_t expected_allocs, size_t expected_frees, bool exact = true) {
diff --git a/src/test/test_opencl_node.cpp b/src/test/test_opencl_node.cpp
index 7bb3149e0e..03a81ccc4d 100644
--- a/src/test/test_opencl_node.cpp
+++ b/src/test/test_opencl_node.cpp
@@ -477,7 +477,7 @@ class ConcurrencyTestBody : NoAssign {
             for ( int i = 0; i < numChecks; i += 2 ) {
                 for ( int j = 0; j < 2; ++j ) {
                     opencl_buffer<cl_char, Factory> b1( f, N );
-                    std::fill( b1.begin(), b1.end(), 1 );
+                    std::fill( b1.begin(), b1.end(), cl_char(1) );
                     input_port<0>( *n2 ).try_put( b1 );
                 }
 
@@ -485,12 +485,12 @@ class ConcurrencyTestBody : NoAssign {
                 opencl_buffer<cl_short, Factory> b( f, 4*N );
                 size_t id0 = (rnd.get() % N) & alignmentMask;
                 opencl_subbuffer<cl_short, Factory> sb1( b, id0, N );
-                std::fill( sb1.begin(), sb1.end(), 0 );
+                std::fill( sb1.begin(), sb1.end(), cl_short(0) );
                 input_port<1>( *n2 ).try_put( sb1 );
 
                 size_t id1 = (rnd.get() % N) & alignmentMask;
                 opencl_subbuffer<cl_short, Factory> sb2 = b.subbuffer( 2*N + id1, N );
-                std::fill( sb2.begin(), sb2.end(), 0 );
+                std::fill( sb2.begin(), sb2.end(), cl_short(0) );
                 input_port<1>( *n2 ).try_put( sb2 );
             }
         } else {
@@ -498,7 +498,7 @@ class ConcurrencyTestBody : NoAssign {
             // output_port<1> of the previous node.
             for ( int i = 0; i < numChecks; ++i ) {
                 opencl_buffer<cl_char, Factory> b( f, N );
-                std::fill( b.begin(), b.end(), 1 );
+                std::fill( b.begin(), b.end(), cl_char(1) );
                 input_port<0>( *n2 ).try_put( b );
             }
         }
diff --git a/src/test/test_partitioner.h b/src/test/test_partitioner.h
index 40ec34db05..b3da6cbf4d 100644
--- a/src/test/test_partitioner.h
+++ b/src/test/test_partitioner.h
@@ -29,6 +29,7 @@
 #endif
 #include "tbb/tbb_stddef.h"
 #include "harness.h"
+#include <vector>
 
 namespace test_partitioner_utils {
 
@@ -346,6 +347,9 @@ class BinaryTree {
         visualize_node(m_root);
     }
 
+    bool operator ==(const BinaryTree& other_tree) const { return compare_nodes(m_root, other_tree.m_root); }
+    void fill_leafs(std::vector<TreeNode*>& leafs) const { fill_leafs_impl(m_root, leafs); }
+
 private:
     TreeNode *m_root;
 
@@ -431,6 +435,20 @@ class BinaryTree {
         if (node->m_right)
             visualize_node(node->m_right, indent + 1);
     }
+
+    bool compare_nodes(TreeNode* node1, TreeNode* node2) const {
+        if (node1 == NULL && node2 == NULL) return true;
+        if (node1 == NULL || node2 == NULL) return false;
+        return are_nodes_equal(node1, node2) && compare_nodes(node1->m_left, node2->m_left)
+            && compare_nodes(node1->m_right, node2->m_right);
+    }
+
+    void fill_leafs_impl(TreeNode* node, std::vector<TreeNode*>& leafs) const {
+        if (node->m_left == NULL && node->m_right == NULL)
+            leafs.push_back(node);
+        if (node->m_left != NULL) fill_leafs_impl(node->m_left, leafs);
+        if (node->m_right != NULL) fill_leafs_impl(node->m_right, leafs);
+    }
 };
 
 class SimpleBody {
diff --git a/src/test/test_partitioner_whitebox.cpp b/src/test/test_partitioner_whitebox.cpp
index 80026eb66c..35eaddea88 100644
--- a/src/test/test_partitioner_whitebox.cpp
+++ b/src/test/test_partitioner_whitebox.cpp
@@ -145,5 +145,7 @@ class ParallelBody: public ParallelTestBody {
 int TestMain() {
     uniform_iterations_distribution::test<ParallelBody <tbb::affinity_partitioner> >();
     uniform_iterations_distribution::test<ParallelBody <tbb::static_partitioner> >();
+    uniform_iterations_distribution::test_task_affinity<tbb::affinity_partitioner>();
+    uniform_iterations_distribution::test_task_affinity<tbb::static_partitioner>();
     return Harness::Done;
 }
diff --git a/src/test/test_partitioner_whitebox.h b/src/test/test_partitioner_whitebox.h
index a4cc44d4a0..aaa2ec64bb 100644
--- a/src/test/test_partitioner_whitebox.h
+++ b/src/test/test_partitioner_whitebox.h
@@ -28,6 +28,7 @@
 #include "string.h"
 #include "harness_assert.h"
 #include "test_partitioner.h"
+#include <numeric>
 
 #if TBB_USE_DEBUG
 // reducing number of simulations due to test timeout
@@ -40,6 +41,11 @@ typedef tbb::enumerable_thread_specific<size_t> ThreadNumsType;
 size_t g_threadNumInitialValue = 10;
 ThreadNumsType g_threadNums(g_threadNumInitialValue);
 
+namespace whitebox_simulation {
+size_t whitebox_thread_index = 0;
+test_partitioner_utils::BinaryTree reference_tree;
+}
+
 // simulate a subset of task.h
 namespace tbb {
 namespace internal {
@@ -64,9 +70,16 @@ class fake_task {
     fake_task *my_parent;
     affinity_id my_affinity;
 };
+namespace task_arena {
+static const int not_initialized = -2;//should match corresponding value in task_arena.h
+}//namespace task_arena
+namespace this_task_arena {
+inline int current_thread_index() { return (int)whitebox_simulation::whitebox_thread_index; }
 }
+}//namespace tbb
 
 #define __TBB_task_H
+#define __TBB_task_arena_H
 #define get_initial_auto_partitioner_divisor my_get_initial_auto_partitioner_divisor
 #define affinity_partitioner_base_v3 my_affinity_partitioner_base_v3
 #define task fake_task
@@ -393,4 +406,66 @@ void test() {
         ParallelTestBody(parallel_group_thread_starting_index));
 }
 
+namespace task_affinity_whitebox {
+size_t range_begin = 0;
+size_t range_end = 20;
+}
+
+template<typename Partitioner>
+void check_tree(const test_partitioner_utils::BinaryTree&);
+
+template<>
+void check_tree<tbb::affinity_partitioner>(const test_partitioner_utils::BinaryTree& tree) {
+    ASSERT(tree == whitebox_simulation::reference_tree,
+        "affinity_partitioner distributes tasks differently from run to run");
+}
+
+template<>
+void check_tree<tbb::static_partitioner>(const test_partitioner_utils::BinaryTree& tree) {
+    std::vector<test_partitioner_utils::TreeNode* > tree_leafs;
+    tree.fill_leafs(tree_leafs);
+    typedef std::vector<size_t> Slots;
+    Slots affinity_slots(tree_leafs.size() + 1, 0);
+
+    for (std::vector<test_partitioner_utils::TreeNode*>::iterator i = tree_leafs.begin(); i != tree_leafs.end(); ++i) {
+        affinity_slots[(*i)->m_affinity]++;
+        if ((*i)->m_affinity == 0)
+            ASSERT((*i)->m_range_begin == task_affinity_whitebox::range_begin,
+                "Task with affinity 0 was executed with wrong range");
+    }
+
+    typedef std::iterator_traits<Slots::iterator>::difference_type slots_difference_type;
+    ASSERT(std::count(affinity_slots.begin(), affinity_slots.end(), size_t(0)) == slots_difference_type(1),
+        "static_partitioner incorrectly distributed tasks by threads");
+    ASSERT(std::count(affinity_slots.begin(), affinity_slots.end(), size_t(1)) == slots_difference_type(g_threadNums.local()),
+        "static_partitioner incorrectly distributed tasks by threads");
+    ASSERT(affinity_slots[tbb::this_task_arena::current_thread_index() + 1] == 0,
+        "static_partitioner incorrectly assigns task with 0 affinity");
+    ASSERT(std::accumulate(affinity_slots.begin(), affinity_slots.end(), size_t(0)) == g_threadNums.local(),
+        "static_partitioner has created more tasks than the number of threads");
+}
+
+template<typename Partitioner>
+void test_task_affinity() {
+    using namespace task_affinity_whitebox;
+    test_partitioner_utils::SimpleBody body;
+    for (size_t p = 1; p <= 50; ++p) {
+        g_threadNums.local() = p;
+        whitebox_simulation::whitebox_thread_index = 0;
+        test_partitioner_utils::TestRanges::BlockedRange range(range_begin, range_end, /*statData*/NULL,
+                                            /*provide_feedback*/false, /*ensure_non_empty_size*/false);
+        Partitioner partitioner;
+        whitebox_simulation::reference_tree = test_partitioner_utils::BinaryTree();
+        whitebox_simulation::parallel_for(range, body, partitioner, &(whitebox_simulation::reference_tree));
+        while (whitebox_simulation::whitebox_thread_index < p) {
+            test_partitioner_utils::BinaryTree tree;
+            whitebox_simulation::parallel_for(range, body, partitioner, &tree);
+            check_tree<Partitioner>(tree);
+            whitebox_simulation::whitebox_thread_index++;
+        }
+        range_begin++;
+        range_end += 2;
+    }
+}
+
 } /* namespace uniform_iterations_distribution */
diff --git a/src/test/test_tbb_version.cpp b/src/test/test_tbb_version.cpp
index bc6606ed19..62f788241d 100644
--- a/src/test/test_tbb_version.cpp
+++ b/src/test/test_tbb_version.cpp
@@ -229,7 +229,7 @@ int main(int argc, char *argv[] ) {
 void initialize_strings_vector(std::vector <string_pair>* vector)
 {
     vector->push_back(string_pair("TBB: VERSION\t\t2018.0", required));       // check TBB_VERSION
-    vector->push_back(string_pair("TBB: INTERFACE VERSION\t10001", required)); // check TBB_INTERFACE_VERSION
+    vector->push_back(string_pair("TBB: INTERFACE VERSION\t10002", required)); // check TBB_INTERFACE_VERSION
     vector->push_back(string_pair("TBB: BUILD_DATE", required));
     vector->push_back(string_pair("TBB: BUILD_HOST", required));
     vector->push_back(string_pair("TBB: BUILD_OS", required));