diff --git a/External/CMakeLists.txt b/External/CMakeLists.txt
index 8afa070a7c..6b03602098 100644
--- a/External/CMakeLists.txt
+++ b/External/CMakeLists.txt
@@ -10,3 +10,4 @@ add_subdirectory(ffmpeg)
 add_subdirectory(skidmarks10)
 add_subdirectory(sollve_vv)
 add_subdirectory(smoke)
+add_subdirectory(OpenCL)
diff --git a/External/OpenCL/CMakeLists.txt b/External/OpenCL/CMakeLists.txt
new file mode 100644
index 0000000000..a039c7667b
--- /dev/null
+++ b/External/OpenCL/CMakeLists.txt
@@ -0,0 +1,49 @@
+include(External)
+
+llvm_externals_find(TEST_SUITE_OPENCL_ROOT "opencl" "OpenCL prerequisites")
+message(STATUS "TEST_SUITE_OPENCL_ROOT: ${TEST_SUITE_OPENCL_ROOT}")
+get_filename_component(OPENCL_CLANG_PATH ${CMAKE_CXX_COMPILER} DIRECTORY)
+string(REGEX REPLACE "bin$" "lib" OPENCL_LIB_PATH "${OPENCL_CLANG_PATH}")
+message(STATUS "OPENCL_CLANG_PATH: ${OPENCL_CLANG_PATH}")
+message(STATUS "OPENCL_LIB_PATH: ${OPENCL_LIB_PATH}")
+
+macro(create_local_test TestName TestSources TestData VariantCPPFLAGS VariantLDFLAGS VariantLibs)
+    set(_sources "${TestSources}")
+    set(_executable ${TestName})
+    set(_data "${TestData}")
+    llvm_test_run(WORKDIR %S
+      EXECUTABLE "LD_LIBRARY_PATH=${OPENCL_LIB_PATH}" ./${_executable} > %o
+    )
+    set(REFERENCE_OUTPUT)
+    # Verify reference output if it exists.
+    if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TestName}.reference_output)
+      set(REFERENCE_OUTPUT ${TestName}.reference_output)
+      llvm_test_verify(WORKDIR %S
+        %b/${FPCMP} %o ${REFERENCE_OUTPUT}
+      )
+      llvm_test_executable(${_executable} "${_sources}")
+      llvm_test_data(${_executable} ${_data} ${REFERENCE_OUTPUT})
+    else()
+      llvm_test_executable(${_executable} "${_sources}")
+      llvm_test_data(${_executable} ${_data})
+    endif()
+    target_compile_options(${_executable} PUBLIC ${VariantCPPFLAGS})
+    target_link_options(${_executable} PUBLIC ${VariantLDFLAGS})
+    if(VariantLibs)
+      target_link_libraries(${_executable} ${VariantLibs})
+    endif()
+    add_dependencies(opencl-tests-simple-${TestName} ${_executable})
+endmacro()
+
+if(TEST_SUITE_OPENCL_ROOT)
+  add_custom_target(opencl-tests-simple COMMENT "Build all simple OpenCL tests")
+  # Add common OpenCL related flags
+  list(APPEND LDFLAGS -lOpenCL -L${OPENCL_LIB_PATH})
+  add_subdirectory(tests)
+  message(STATUS "OPENCL_SIMPLE_TESTS_LIST: ${OPENCL_SIMPLE_TESTS_LIST}")
+  add_custom_target(check-opencl-simple COMMENT "Run all simple OpenCL tests"
+    COMMAND ${TEST_SUITE_LIT} ${TEST_SUITE_LIT_FLAGS} ${OPENCL_SIMPLE_TESTS_LIST}
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+    DEPENDS opencl-tests-simple
+    USES_TERMINAL)
+endif()
diff --git a/External/OpenCL/README b/External/OpenCL/README
new file mode 100644
index 0000000000..3b6745a00f
--- /dev/null
+++ b/External/OpenCL/README
@@ -0,0 +1,39 @@
+OpenCL Tests
+==========
+
+OpenCL tests are enabled if cmake is invoked with
+-DTEST_SUITE_EXTERNALS_DIR=<externals path> and specified externals directory contains at least one ROCm installation.
+
+Expected externals directory structure:
+Externals/
+  opencl/
+    opencl[-version]/ -- OpenCL compiler installation, which should, at the very least, have the following components:
+      include/CL/ -- header files including opencl.h 
+      bin/ -- binaries including clang/clang++ (and clinfo for convenience)
+      lib/ -- libraries including libOpenCL.so and implementation specifics (like libamdocl64.so, libhsa-runtime64.so, librocprofiler-register.so, libamd_comgr.so.2)
+      lib/clang/NN/include/ -- some clang related inclues and libs
+      lib/clang/NN/lib/ -- libclang_rt.builtins-x86_64.a
+
+[export AMDGPU_ARCHS=gfx906;gfx908                 # List of AMDGPU archs to compile, not used at the moment]
+export EXTERNAL_DIR=/your/Externals/path           # Path to Exteranls with the OpenCL compiler
+export CLANG_DIR=/your/clang/bin                   # Path to llvm-test-suite build
+export TEST_SUITE_DIR=/path/to/test-suite-sources  # Path to llvm-test-suite sources
+
+Configure, build and run tests:
+
+```
+$ mkdir build-llvm-test-suite
+$ cd build-llvm-test-suite
+
+$ export EXTERNAL_DIR=/repos/Temp/LlvmTestSuite/Externals
+$ export TEST_SUITE_DIR=/repos/llvm-test-suite
+$ export CLANG_DIR=$EXTERNAL_DIR/opencl/opencl-6.3.0-14740
+
+$ cmake -G Ninja -DTEST_SUITE_EXTERNALS_DIR=$EXTERNAL_DIR -DAMDGPU_ARCHS=$AMDGPU_ARCHS -DCMAKE_CXX_COMPILER="$CLANG_DIR/bin/amdclang++" -DCMAKE_C_COMPILER="$CLANG_DIR/bin/amdclang" $TEST_SUITE_DIR 
+
+$ ninja opencl-tests-simple             # To build all the tests
+$ ninja opencl-tests-simple-HelloWorld  # To build a apecific test
+$ ninja check-opencl-simple             # To build & run all the tests
+$ ninja check-opencl-simple-HelloWorld  # To build & run a specific test
+```
+
diff --git a/External/OpenCL/tests/AMDAPP_SDK/BufferBandwidth/BufferBandwidth.cpp b/External/OpenCL/tests/AMDAPP_SDK/BufferBandwidth/BufferBandwidth.cpp
new file mode 100644
index 0000000000..b382dcfcac
--- /dev/null
+++ b/External/OpenCL/tests/AMDAPP_SDK/BufferBandwidth/BufferBandwidth.cpp
@@ -0,0 +1,1387 @@
+/**********************************************************************
+Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+•	Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+•	Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or
+ other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+********************************************************************/
+
+#include "Shared.h"
+#include "Host.h"
+#include "Log.h"
+#include "Timer.h"
+
+#define SUCCESS 0
+#define FAILURE 1
+#define EXPECTED_FAILURE 2
+
+#define SAMPLE_VERSION "AMD-APP-SDK-v2.9.214.1"
+
+#define  WS 128       // work group size
+
+int nLoops;           // overall number of timing loops
+int loop;             // number of read/write buffer loops
+int nRepeats;         // # of repeats for each transfer step
+int nSkip;            // to discount lazy allocation effects, etc.
+int nKLoops;          // repeat inside kernel to show peak mem B/W
+int nBytes;           // input and output buffer size
+int nThreads;         // number of Device work items
+int nItems;           // number of 32-bit 4-vectors for Device kernel
+int nAlign;           // safe bet for most PCs
+
+int nBytesResult;
+
+bool printLog;
+bool doHost;
+int  whichTest;
+
+bool mapRW;
+bool timings;
+int  nWF;
+double setupTime;
+
+TestLog *tlog;
+bool vFailure = false;
+bool signalA = false;
+bool signalB = false;
+void *memIn,
+     *memOut,
+     *memResult,
+     *memScratch;
+
+cl_mem inputBuffer,
+       outputBuffer,
+       resultBuffer,
+	   resultBuffer2,
+       copyBuffer;
+
+void usage()
+{
+            std::cout << "\nOptions:\n\n" <<
+                 "   -type <n>          Type of test:\n\n" <<
+                 "                      0 clEnqueue[Map,Unmap]\n" <<
+                 "                      1 clEnqueue[Read,Write]\n" <<
+                 "                      2 clEnqueueCopy\n" <<
+                 "                      3 clEnqueue[Read,Write], prepinned\n\n" <<
+                 "   -pcie or -dma      Measure PCIe/interconnect bandwidth\n" <<
+				 "   -noblock           When -pcie is active, measure PCIe/interconnect\n" <<
+				 "                      bandwidth using multiple back-to-back asynchronous\n" <<
+				 "                      buffer copies\n" <<
+				 "   -lp <n>            When -noblock is active, set the number of read/write\n" << 
+				 "                      buffer\n" <<
+				 "   -nwk <n>           Number of CPU workers (max: 32 (Linux: 1))\n" <<
+                 "   -nl <n>            Number of timing loops\n" <<		
+                 "   -nr <n>            Repeat each timing <n> times (can't be 0)\n" <<
+                 "   -nk <n>            Number of loops in kernel\n" <<
+                 "   -nb <n>            Buffer size in bytes (min: 2048*CPU Workers)\n" <<
+                 "   -nw <n>            # of wave fronts per SIMD (can't be 0)\n" <<
+                 "                      (default: 7)\n" <<
+                 "   -l                 Print complete timing log\n" <<
+                 "   -s <n>             Skip first <n> timings for average\n" <<
+                 "                      (default: 1)\n" << 
+                 "   -[if,of,cf] <n>    Input, output, copy flags\n" <<
+                 "                      (ok to use multiple):\n\n";
+
+            for(int i = 0; i < nFlags; i++ )
+                std::cout << "                      " << i << " " << flags[i].s << "\n";
+            std::cout << "\n";
+
+            std::cout << "   -m                 always map as MAP_READ | MAP_WRITE\n" <<
+                         "   -db                disable host mem B/W baseline\n" <<
+                         "   -d <n>             Number of Device device\n" << 
+						 "                      (always be 0, mean use the first device)\n" <<
+						 "   -t                 Print all timings including setup-time\n" <<
+                         "   -h                 print this message\n\n" <<
+                         "   -v                 print the APP SDK version\n\n";
+            exit(SUCCESS);
+}
+
+void parseOptions(int argc, char * argv[])
+{
+    while(--argc) 
+    {
+#ifdef _WIN32
+        if( strcmp(argv[argc], "-nwk") == 0 )
+		{
+			if ((argv[ argc + 1 ]) == NULL)
+			{
+				std::cout<<"Argument needed for -nwk!"<<std::endl;
+				exit(EXPECTED_FAILURE);
+			}
+			else
+			{
+				nWorkers = atoi( argv[ argc + 1 ] );
+				if (nWorkers <= 0)
+				{
+					std::cout<<"Invalid argument for -nwk!"<<std::endl;
+					exit(EXPECTED_FAILURE);
+				}
+			}
+		}
+            
+#endif
+        if( strcmp(argv[argc], "-nl") == 0 )           
+		{
+			if ((argv[ argc + 1 ]) == NULL)
+			{
+				std::cout<<"Argument needed for -nl!"<<std::endl;
+				exit(EXPECTED_FAILURE);
+			}
+			else
+			{
+				nLoops = atoi( argv[ argc + 1 ] );
+				if (nLoops <= 0)
+				{
+					std::cout<<"Invalid argument for -nl!"<<std::endl;
+					exit(EXPECTED_FAILURE);
+				}
+			}
+		}
+		if( strcmp(argv[argc], "-lp") == 0 )           
+		{
+			if ((argv[ argc + 1 ]) == NULL)
+			{
+				std::cout<<"Argument needed for -lp!"<<std::endl;
+				exit(EXPECTED_FAILURE);
+			}
+			else
+			{
+				loop = atoi( argv[ argc + 1 ] );
+				if (loop <= 0)
+				{
+					std::cout<<"Invalid argument for -lp!"<<std::endl;
+					exit(EXPECTED_FAILURE);
+				}
+			}
+		}
+        if( strcmp(argv[argc], "-nb") == 0 )          
+		{
+			if ((argv[ argc + 1 ]) == NULL)
+			{
+				std::cout<<"Argument needed for -nb!"<<std::endl;
+				exit(EXPECTED_FAILURE);
+			}
+			else
+			{
+				nBytes = atoi( argv[ argc + 1 ] );
+				if (nBytes <= 0)
+				{
+					std::cout<<"Invalid argument for -nb!"<<std::endl;
+					exit(EXPECTED_FAILURE);
+				}
+			}
+		}
+
+        if( strcmp(argv[argc], "-nr") == 0 )           
+		{
+			if ((argv[ argc + 1 ]) == NULL)
+			{
+				std::cout<<"Argument needed for -nr!"<<std::endl;
+				exit(EXPECTED_FAILURE);
+			}
+			else
+			{
+				nRepeats = atoi( argv[ argc + 1 ] );
+				if (nRepeats <= 0)
+				{
+					std::cout<<"Invalid argument for -nr!"<<std::endl;
+					exit(EXPECTED_FAILURE);
+				}	
+			}
+		}
+
+        if( strcmp(argv[argc], "-nk") == 0 )          
+		{
+			if ((argv[ argc + 1 ]) == NULL)
+			{
+				std::cout<<"Argument needed for -nk!"<<std::endl;
+				exit(EXPECTED_FAILURE);
+			}
+			else
+			{
+				nKLoops = atoi( argv[ argc + 1 ] );
+				if (nKLoops <= 0)
+				{
+					std::cout<<"Invalid argument for -nk!"<<std::endl;
+					exit(EXPECTED_FAILURE);
+				}
+			}
+		}
+
+        if( strcmp(argv[argc], "-nw") == 0 )         
+		{
+			if ((argv[ argc + 1 ]) == NULL)
+			{
+				std::cout<<"Argument needed for -nw!"<<std::endl;
+				exit(EXPECTED_FAILURE);
+			}
+			else 
+			{
+				nWF = atoi( argv[ argc + 1 ] );
+				if (nWF <= 0)
+				{
+					std::cout<<"Invalid argument for -nw!"<<std::endl;
+					exit(EXPECTED_FAILURE);
+				}	
+			}
+					
+		}
+
+        if( strcmp(argv[argc], "-type") == 0 )
+		{
+			if ((argv[ argc + 1 ]) == NULL)
+			{
+				std::cout<<"Argument needed for -type!"<<std::endl;
+				exit(EXPECTED_FAILURE);
+			}
+			else
+			{
+				whichTest = atoi( argv[ argc + 1 ] );
+			}
+		}
+
+		if( strcmp(argv[argc], "-pcie") == 0 ||
+		    strcmp(argv[argc], "-dma") == 0 )
+		{    
+			signalA = true;
+		}
+
+		if(strcmp(argv[argc], "-noblock") == 0)
+		{    
+			signalB = true;
+		}
+
+		if( strcmp(argv[argc], "-s") == 0 )           
+		{
+			if ((argv[ argc + 1 ]) == NULL)
+			{
+				std::cout<<"Argument needed for -s!"<<std::endl;
+				exit(EXPECTED_FAILURE);
+			}
+			else
+			{
+				nSkip = atoi( argv[ argc + 1 ] );
+				if (nSkip <= 0)
+				{
+					std::cout<<"Invalid argument for -s!"<<std::endl;
+					exit(EXPECTED_FAILURE);
+				}	
+			}
+		}
+
+        if( strcmp(argv[argc], "-l") == 0 )
+            printLog = true;
+
+        if( strcmp(argv[argc], "-if") == 0 )
+        {
+			if ((argv[ argc + 1 ]) == NULL)
+			{
+				std::cout<<"Argument needed for -if!"<<std::endl;
+				exit(EXPECTED_FAILURE);
+			}
+			else
+			{
+				 int f = atoi( argv[ argc + 1 ] );
+				 if( f < nFlags )
+                 inFlags |= flags[ f ].f;
+			}           
+        }
+
+        if( strcmp(argv[argc], "-of") == 0 )
+		{
+			if ((argv[ argc + 1 ]) == NULL)
+			{
+				std::cout<<"Argument needed for -of!"<<std::endl;
+				exit(EXPECTED_FAILURE);
+			}
+			else
+			{
+				int f = atoi( argv[ argc + 1 ] );
+                if( f < nFlags )
+                outFlags |= flags[ f ].f;
+			}           
+        }
+
+        if( strcmp(argv[argc], "-cf") == 0 )
+		{
+			if ((argv[ argc + 1 ]) == NULL)
+			{
+				std::cout<<"Argument needed for -cf!"<<std::endl;
+				exit(EXPECTED_FAILURE);
+			}
+			else
+			{
+				int f = atoi( argv[ argc + 1 ] );
+                if( f < nFlags )
+                copyFlags |= flags[ f ].f;
+			}           
+        }      
+
+        if( strcmp(argv[argc], "-db") == 0 )
+        {
+            doHost = false;
+        }
+
+        if( strcmp(argv[argc], "-m") == 0 )
+        {
+            mapRW = true;
+        }
+
+		if( strcmp(argv[argc], "-t") == 0 )
+        {
+			timings = true;
+        }
+
+        if( strcmp(argv[argc], "-d") == 0 )
+		{
+			if ((argv[ argc + 1 ]) == NULL)
+			{
+				std::cout<<"Argument needed for -d!"<<std::endl;
+				exit(EXPECTED_FAILURE);
+			}
+			else
+			{
+				devnum = atoi( argv[ argc + 1 ] );
+				if (devnum != 0)
+				{
+					std::cout<<"Invalid argument for -d!"<<std::endl;
+					exit(EXPECTED_FAILURE);
+				}
+			}
+		}
+
+        if( strcmp(argv[argc], "-h") == 0 )
+           usage();
+
+        if( strcmp(argv[argc], "-v") == 0 )
+        {
+            std::cout << "SDK version : " << SAMPLE_VERSION << std::endl;
+            exit(0);
+        }
+    }
+
+    if( whichTest > 3 || whichTest < 0 )
+	{
+		std::cout<<"Testtype index should be between 0 and 3!"<<std::endl;
+		usage();
+	}
+    
+}
+
+void timedBufMappedRead( cl_command_queue queue,
+                         cl_mem buf,
+                         unsigned char v,
+                         bool pcie )
+{
+    CPerfCounter t1, t2, t3;
+    cl_int ret;
+    cl_event ev;
+    void *ptr;
+    cl_map_flags mapFlag = CL_MAP_READ | CL_MAP_WRITE;
+
+    t1.Reset();
+    t2.Reset();
+    t3.Reset();
+
+    t1.Start();
+
+    if( !mapRW )
+        mapFlag = CL_MAP_READ;
+
+    ptr = (void * ) clEnqueueMapBuffer( queue,
+                                        buf,
+                                        CL_FALSE,
+                                        mapFlag,
+                                        0,
+                                        nBytes,
+                                        0, NULL,
+                                        &ev,
+                                        &ret );
+    ASSERT_CL_RETURN( ret );
+
+    clFlush( queue );
+    spinForEventsComplete( 1, &ev );
+
+    t1.Stop();
+
+    t2.Start();
+
+    bool verify = readVerifyMemCPU_MT( ptr, v, nBytes );
+	if(!verify)
+		vFailure = true;
+
+    t2.Stop();
+
+    t3.Start();
+
+    ret = clEnqueueUnmapMemObject( queue,
+                                   buf,
+                                   (void *) ptr,
+                                   0, NULL, &ev );
+    ASSERT_CL_RETURN( ret );
+
+    clFlush( queue );
+    spinForEventsComplete( 1, &ev );
+
+    t3.Stop();
+
+    if( pcie )
+    {
+        tlog->Timer( "PCIe B/W host->device (GBPS)", t1.GetElapsedTime(), nBytes, 1 );
+    }
+    else
+    {
+        const char *msg;
+
+        if( mapRW )
+            msg = "clEnqueueMapBuffer -- READ|WRITE (GBPS)";
+        else
+            msg = "clEnqueueMapBuffer -- READ (GBPS)";
+
+        tlog->Timer( msg, t1.GetElapsedTime(), nBytes, 1 );
+        tlog->Timer( "CPU read (GBPS)", t2.GetElapsedTime(), nBytes, 1 ); 
+		tlog->Timer( "clEnqueueUnmapMemObject() (GBPS)", t3.GetElapsedTime(), nBytes, 1 );
+
+	    if( verify )
+            tlog->Msg( "\n Verification Passed!\n", "" );
+        else
+        {
+            tlog->Error( "\n Verification Failed!\n", "" );
+            vFailure = true;
+        }
+    }
+}
+
+void timedBufMappedWrite( cl_command_queue queue,
+                          cl_mem buf,
+                          unsigned char v,
+                          bool pcie )
+{
+    CPerfCounter t1, t2, t3;
+    cl_int ret;
+    cl_event ev;
+    void *ptr;
+    cl_map_flags mapFlag = CL_MAP_READ | CL_MAP_WRITE;
+
+    t1.Reset();
+    t2.Reset();
+    t3.Reset();
+
+    t1.Start();
+
+    if( !mapRW )
+        mapFlag = CL_MAP_WRITE_INVALIDATE_REGION;
+
+    ptr = (void * ) clEnqueueMapBuffer( queue,
+                                        buf,
+                                        CL_FALSE,
+                                        mapFlag,
+                                        0,
+                                        nBytes,
+                                        0, NULL, 
+                                        &ev,
+                                        &ret );
+    ASSERT_CL_RETURN( ret );
+
+    clFlush( queue );
+    spinForEventsComplete( 1, &ev );
+
+    t1.Stop();
+
+    t2.Start();
+
+    memset_MT( ptr, v, nBytes );
+
+    t2.Stop();
+
+    t3.Start();
+
+    ret = clEnqueueUnmapMemObject( queue,
+                                   buf,
+                                   (void *) ptr,
+                                   0, NULL, &ev );
+    ASSERT_CL_RETURN( ret );
+
+    clFlush( queue );
+    spinForEventsComplete( 1, &ev );
+
+    t3.Stop();
+
+    if( pcie )
+    {
+        tlog->Timer( "PCIe B/W device->host (GBPS)", t3.GetElapsedTime(), nBytes, 1 );
+    }
+    else
+    {
+        const char *msg;
+
+        if( mapRW )
+            msg = "clEnqueueMapBuffer -- READ|WRITE (GBPS)";
+        else
+            msg = "clEnqueueMapBuffer -- WRITE (GBPS)";
+
+        tlog->Timer( msg, t1.GetElapsedTime(), nBytes, 1 );
+
+        tlog->Timer( "memset() (GBPS)", t2.GetElapsedTime(), nBytes, 1 );
+
+        tlog->Timer( "clEnqueueUnmapMemObject() (GBPS)", t3.GetElapsedTime(), nBytes, 1 );
+    }
+}
+
+void timedBufMap( cl_command_queue queue,
+                  cl_mem buf,
+                  void **ptr,
+                  bool quiet )
+{
+    CPerfCounter t1;
+    cl_int ret;
+    cl_event ev;
+    cl_map_flags mapFlag = CL_MAP_READ | CL_MAP_WRITE;
+
+    t1.Reset();
+    t1.Start();
+
+    *ptr = (void * ) clEnqueueMapBuffer( queue,
+                                         buf,
+                                         CL_FALSE,
+                                         mapFlag,
+                                         0,
+                                         nBytes,
+                                         0, NULL,
+                                         &ev,
+                                         &ret );
+    ASSERT_CL_RETURN( ret );
+
+    clFlush( queue );
+    spinForEventsComplete( 1, &ev );
+
+    t1.Stop();
+
+    const char *msg;
+
+    if( mapRW )
+        msg = "clEnqueueMapBuffer -- READ|WRITE (GBPS)";
+    else
+        msg = "clEnqueueMapBuffer -- READ (GBPS)";
+
+    if( !quiet )
+        tlog->Timer( msg, t1.GetElapsedTime(), nBytes, 1 );
+}
+
+void timedBufUnmap( cl_command_queue queue,
+                    cl_mem buf,
+                    void **ptr,
+                    bool quiet )
+{
+    CPerfCounter t1;
+    cl_int ret;
+    cl_event ev;
+
+    t1.Reset();
+    t1.Start();
+
+    ret = clEnqueueUnmapMemObject( queue,
+                                   buf,
+                                   (void *) *ptr,
+                                   0, NULL, &ev );
+    ASSERT_CL_RETURN( ret );
+
+    clFlush( queue );
+    spinForEventsComplete( 1, &ev );
+
+    t1.Stop();
+
+    if( !quiet )
+        tlog->Timer( "clEnqueueUnmapMemObject() (GBPS)", t1.GetElapsedTime(), nBytes, 1 );
+}
+
+void timedBufCLRead( cl_command_queue queue,
+                     cl_mem buf,
+                     void *ptr,
+                     unsigned char v,
+                     bool pcie )
+{
+    CPerfCounter t1;
+    cl_int ret;
+    cl_event ev;
+
+    t1.Start();
+
+    ret = clEnqueueReadBuffer( queue,
+                               buf,
+                               CL_FALSE,
+                               0,
+                               nBytes,
+                               ptr,
+                               0, NULL,
+                               &ev );
+    ASSERT_CL_RETURN( ret );
+
+    clFlush( queue );
+    spinForEventsComplete( 1, &ev );
+
+    t1.Stop();
+
+    bool verify = readVerifyMemCPU_MT( ptr, v, nBytes );
+	if(!verify)
+		vFailure = true;
+
+    if( pcie )
+    {
+        tlog->Timer("PCIe B/W device->host (GBPS)", t1.GetElapsedTime(), nBytes, 1 );
+    }
+    else
+    {
+        tlog->Timer( "clEnqueueReadBuffer (GBPS)", t1.GetElapsedTime(), nBytes, 1 );
+    }
+}
+
+void timedBufCLWrite( cl_command_queue queue,
+                      cl_mem buf,
+                      void *ptr,
+                      unsigned char v,
+                      bool pcie )
+{
+    CPerfCounter t1;
+    cl_int ret;
+    cl_event ev;
+
+    memset( ptr, v, nBytes );
+
+    t1.Start();
+
+    ret = clEnqueueWriteBuffer( queue,
+                                buf,
+                                CL_FALSE,
+                                0,
+                                nBytes,
+                                ptr,
+                                0, NULL,
+                                &ev );
+    ASSERT_CL_RETURN( ret );
+
+    clFlush( queue );
+    spinForEventsComplete( 1, &ev );
+
+    t1.Stop();
+
+    if( pcie )
+    {
+        tlog->Timer( "PCIe B/W host->device (GBPS)", t1.GetElapsedTime(), nBytes, 1 );
+    }
+    else
+    {
+        tlog->Timer( "clEnqueueWriteBuffer (GBPS)", t1.GetElapsedTime(), nBytes, 1 );
+    }
+}
+
+void timedBufCLCopy( cl_command_queue queue,
+                     cl_mem srcBuf,
+                     cl_mem dstBuf )
+{
+    CPerfCounter t1;
+    cl_int ret;
+    cl_event ev;
+
+    t1.Start();
+
+    ret = clEnqueueCopyBuffer( queue,
+                               srcBuf,
+                               dstBuf,
+                               0, 0,
+                               nBytes,
+                               0, NULL,
+                               &ev );
+    ASSERT_CL_RETURN( ret );
+
+    clFlush( queue );
+    spinForEventsComplete( 1, &ev );
+
+    t1.Stop();
+
+    tlog->Timer( "clEnqueueCopyBuffer (GBPS)", t1.GetElapsedTime(), nBytes, 1 );
+}
+
+void timedKernel( cl_command_queue queue,
+                  cl_kernel        kernel,
+                  cl_mem           bufSrc,
+                  cl_mem           bufDst,
+                  unsigned char    v,
+                  bool             quiet )
+{
+     cl_int ret;
+     cl_event ev=0;
+     CPerfCounter t;
+
+     cl_uint nItemsPerThread = nItems / nThreads;
+
+     size_t global_work_size[2] = { nThreads, 0 };
+     size_t local_work_size[2] = { WS, 0 };
+
+     cl_uint val=0;
+
+     for(int i = 0; i < sizeof( cl_uint ); i++)
+        val |= v << (i * 8);
+
+     clSetKernelArg( kernel, 0, sizeof(void *),  (void *) &bufSrc );
+     clSetKernelArg( kernel, 1, sizeof(void *),  (void *) &bufDst );
+     clSetKernelArg( kernel, 2, sizeof(cl_uint), (void *) &nItemsPerThread);
+     clSetKernelArg( kernel, 3, sizeof(cl_uint), (void *) &val);
+     clSetKernelArg( kernel, 4, sizeof(cl_uint), (void *) &nKLoops);
+
+     t.Reset();
+     t.Start();
+
+     ret = clEnqueueNDRangeKernel( queue,
+                                   kernel,
+                                   1,
+                                   NULL,
+                                   global_work_size,
+                                   local_work_size,
+                                   0, NULL, &ev );
+     ASSERT_CL_RETURN( ret );
+
+     clFlush( queue );
+     spinForEventsComplete( 1, &ev );
+
+     t.Stop();
+
+     if( !quiet )
+         tlog->Timer( "clEnqueueNDRangeKernel() (GBPS)", 
+                      t.GetElapsedTime(), nBytes, nKLoops );
+}
+
+void timedReadKernelVerify( cl_command_queue queue,
+                            cl_kernel        kernel,
+                            cl_mem           bufSrc,
+                            cl_mem           bufRes,
+                            unsigned char    v,
+                            bool             quiet )
+{
+    cl_int ret;
+    cl_event ev;
+
+    timedKernel( queue, kernel, bufSrc, bufRes, v, quiet );
+
+    ret = clEnqueueReadBuffer( queue,
+                               bufRes,
+                               CL_FALSE,
+                               0,
+                               nBytesResult,
+                               memResult,
+                               0, NULL,
+                               &ev );
+
+    ASSERT_CL_RETURN( ret );
+
+    clFlush( queue );
+    spinForEventsComplete( 1, &ev );
+
+     cl_uint sum = 0;
+
+     for(int i = 0; i < nThreads / WS; i++)
+         sum += ((cl_uint *) memResult)[i];
+
+     bool verify;
+
+     if( sum == nBytes / sizeof(cl_uint) )
+         verify = true;
+     else
+     {
+         verify = false;
+         vFailure = true;
+     }
+
+	 if( !quiet )
+     {
+        if( verify )
+            tlog->Msg( "\n Verification Passed!\n", "" );
+        else
+            tlog->Error( "\n Verification Failed!\n", "" );
+     }
+}
+
+void createBuffers()
+{
+   // host memory buffers
+
+#ifdef _WIN32
+   memIn =      (void *) _aligned_malloc( nBytes, nAlign );
+   memOut =     (void *) _aligned_malloc( nBytes, nAlign );
+   memResult =  (void *) _aligned_malloc( nBytesResult, nAlign );
+   memScratch = (void *) _aligned_malloc( nBytes, nAlign );
+#else
+   memIn =      (void *) memalign( nAlign, nBytes );
+   memOut =     (void *) memalign( nAlign, nBytes );
+   memResult =  (void *) memalign( nAlign, nBytesResult );
+   memScratch = (void *) memalign( nAlign, nBytes );
+#endif
+
+   if( memIn == NULL ||
+       memOut == NULL ||
+       memResult == NULL ||
+       memScratch == NULL ) 
+   {
+       fprintf( stderr, "%s:%d: error: %s\n", \
+                __FILE__, __LINE__, "could not allocate host buffers\n" );
+       exit(FAILURE);
+   }
+
+   // CL buffers
+
+   cl_int ret;
+   void *hostPtr = NULL;
+
+   if( inFlags & CL_MEM_USE_HOST_PTR ||
+       inFlags & CL_MEM_COPY_HOST_PTR )
+       hostPtr = memIn;
+
+   inputBuffer = clCreateBuffer( context,
+                                 inFlags,
+                                 nBytes,
+                                 hostPtr, &ret );
+
+   ASSERT_CL_RETURN( ret );
+
+   hostPtr = NULL;
+
+   if( outFlags & CL_MEM_USE_HOST_PTR ||
+       outFlags & CL_MEM_COPY_HOST_PTR )
+       hostPtr = memOut;
+
+   outputBuffer = clCreateBuffer( context,
+                                  outFlags,
+                                  nBytes,
+                                  hostPtr, &ret );
+
+   ASSERT_CL_RETURN( ret );
+
+   hostPtr = NULL;
+
+   if( copyFlags & CL_MEM_USE_HOST_PTR ||
+       copyFlags & CL_MEM_COPY_HOST_PTR )
+       hostPtr = memScratch;
+
+   if( whichTest == 2 ||
+       whichTest == 3 ||
+	   signalA )
+      copyBuffer = clCreateBuffer( context,
+                                   copyFlags,
+                                   nBytes,
+                                   hostPtr, &ret );
+
+   ASSERT_CL_RETURN( ret );
+
+   resultBuffer = clCreateBuffer( context,
+                                  CL_MEM_READ_WRITE,
+                                  nBytesResult,
+                                  NULL, &ret );
+   ASSERT_CL_RETURN( ret );
+
+   resultBuffer2 = clCreateBuffer( context,
+                                  CL_MEM_READ_WRITE,
+                                  nBytesResult,
+                                  NULL, &ret );
+   ASSERT_CL_RETURN( ret );
+}
+
+void printHeader()
+{
+	std::cout << "\nDevice " << std::setw(2) << devnum << "            " << devname << "\n";
+
+#ifdef _WIN32
+    std::cout << "Build:               _WINxx"; 
+#ifdef _DEBUG
+    std::cout << " DEBUG";
+#else
+    std::cout << " release";
+#endif
+    std::cout << "\n" ;
+#else
+#ifdef NDEBUG
+    std::cout << "Build:               release\n";
+#else
+    std::cout << "Build:               DEBUG\n";
+#endif
+#endif
+
+    std::cout << "Device work items:      " << nThreads << std::endl <<
+                 "Buffer size:         " << nBytes << std::endl <<
+                 "CPU workers:         " << nWorkers << std::endl <<
+                 "Timing loops:        " << nLoops << std::endl <<
+                 "Repeats:             " << nRepeats << std::endl <<
+                 "Kernel loops:        " << nKLoops << std::endl;
+
+    std::cout << "inputBuffer:         ";
+
+   for( int i = 0; i < nFlags; i++ )
+      if( inFlags & flags[i].f )
+          std::cout << flags[i].s << " ";
+
+   std::cout << "\noutputBuffer:        ";
+
+   for( int i = 0; i < nFlags; i++ )
+      if( outFlags & flags[i].f )
+         std::cout << flags[i].s << " ";
+
+   if( whichTest == 2 ||
+       whichTest == 3 ||
+       signalA)
+   {
+       std::cout << "\ncopyBuffer:          " ;
+
+      for( int i = 0; i < nFlags; i++ )
+      if( copyFlags & flags[i].f )
+          std::cout << flags[i].s << " ";
+   }
+
+   std::cout << "\n\n";
+}
+
+void printResults()
+{
+    if(timings)
+	{
+		std::cout << std::setw(21) << std::left << "Setup Time"
+				<< setupTime << " secs" << std::endl;
+	}
+	
+	if( printLog ) 
+      tlog->printLog();
+
+   tlog->printSummary( nSkip );
+
+   std::cout << "\n" ;
+   fflush(stdout);
+}
+
+void runMapTest()
+{
+   int  nl = nLoops;
+
+   while( nl-- )
+   {
+      tlog->loopMarker();
+
+      tlog->Msg( "\n\n%s\n", "1. Host mapped write to inputBuffer" );
+
+      for(int i = 0; i < nRepeats; i++)
+          timedBufMappedWrite( queue, inputBuffer, nl & 0xff, false );
+
+      tlog->Msg( "\n\n%s\n", "2. Device kernel read of inputBuffer" );
+
+      for(int i = 0; i < nRepeats; i++)
+          timedReadKernelVerify( queue, read_kernel, inputBuffer, resultBuffer, nl & 0xff, false );
+
+      tlog->Msg( "\n\n%s\n", "3. Device kernel write to outputBuffer" );
+
+      for(int i = 0; i < nRepeats; i++)
+          timedKernel( queue, write_kernel, resultBuffer, outputBuffer, nl & 0xff, false );
+
+      tlog->Msg( "\n\n%s\n", "4. Host mapped read of outputBuffer" );
+
+      for(int i = 0; i < nRepeats; i++)
+          timedBufMappedRead( queue, outputBuffer, nl & 0xff, false );
+
+      tlog->Msg( "%s\n", "" );
+   }
+}
+
+void runPCIeTest()
+{
+   int nl = nLoops;
+
+   void *mappedPtr;
+   while( nl-- )
+   {
+      tlog->loopMarker();
+
+      tlog->Msg( "%s\n", "" );
+
+      timedBufMap( queue, copyBuffer, &mappedPtr, true );
+	  
+      for(int i = 0; i < nRepeats; i++)
+          timedBufCLWrite( queue, inputBuffer, mappedPtr, nl & 0xff, true );
+	  
+      for(int i = 0; i < nRepeats; i++)
+          timedReadKernelVerify( queue, read_kernel, inputBuffer, resultBuffer, nl & 0xff, true );
+
+      for(int i = 0; i < nRepeats; i++)
+          timedKernel( queue, write_kernel, resultBuffer, outputBuffer, nl & 0xff, true );
+	 
+      for(int i = 0; i < nRepeats; i++)
+          timedBufCLRead( queue, outputBuffer, mappedPtr, nl & 0xff, true );
+
+      timedBufUnmap( queue, copyBuffer, &mappedPtr, true );
+
+      tlog->Msg( "%s\n", "" );
+   }
+}
+void runPCIeTestNoblock()
+{
+   	int nl = nLoops;
+    void *mappedPtr;
+
+	while( nl-- )
+	{
+		tlog->loopMarker();
+
+		tlog->Msg( "%s\n", "" );
+
+		timedBufMap( queue, copyBuffer, &mappedPtr, true );
+		CPerfCounter t1;
+		cl_int ret;
+		cl_event ev;
+		bool flag=true;
+		memset( mappedPtr, nl & 0xff, nBytes );
+		t1.Reset();
+		t1.Start();
+		for(int i = 0; i < loop; i++)
+		{
+			ret = clEnqueueWriteBuffer( queue,
+										inputBuffer,
+										CL_FALSE,
+										0,
+										nBytes,
+										mappedPtr,
+										0, NULL,
+										&ev );
+			ASSERT_CL_RETURN( ret );
+		}
+		clFlush( queue );
+		spinForEventsComplete( 1, &ev );
+
+		t1.Stop();
+		double avg=(t1.GetElapsedTime())/loop;
+		if( flag )
+		{
+			tlog->Timer( "PCIe B/W host->device (GBPS)", avg, nBytes, 1 );
+		}
+		else
+		{
+			tlog->Timer( "clEnqueueWriteBuffer (GBPS)", avg, nBytes, 1 );
+		}
+		timedReadKernelVerify( queue, read_kernel, inputBuffer, resultBuffer, nl & 0xff, true );
+
+		timedKernel( queue, write_kernel, resultBuffer, outputBuffer, nl & 0xff, true );
+		t1.Reset();
+		t1.Start();
+		for(int i = 0; i < loop; i++)
+		{
+			ret = clEnqueueReadBuffer( queue,
+								   outputBuffer,
+								   CL_FALSE,
+								   0,
+								   nBytes,
+								   mappedPtr,
+								   0, NULL,
+								   &ev );
+			ASSERT_CL_RETURN( ret );
+		}
+		clFlush( queue );
+		spinForEventsComplete( 1, &ev );
+
+		t1.Stop();
+		avg=(t1.GetElapsedTime())/loop;
+		bool verify = readVerifyMemCPU_MT( mappedPtr, nl & 0xff, nBytes );
+
+		if(flag)
+		{
+			tlog->Timer( "PCIe B/W device->host (GBPS)", avg, nBytes, 1 );
+		}
+		else
+		{
+			tlog->Timer( "clEnqueueReadBuffer (GBPS)", avg, nBytes, 1 );
+		}
+		timedBufUnmap( queue, copyBuffer, &mappedPtr, true );
+
+		if(!flag)
+		{
+			if(!verify)
+			{
+				vFailure = true;
+			}
+		}
+		tlog->Msg( "%s\n", "" );
+	}
+}
+
+void runReadWriteTest()
+{
+   
+   int nl = nLoops;
+
+   while( nl-- )
+   {
+      tlog->loopMarker();
+
+      tlog->Msg( "\n\n%s\n", "1. Host CL write to inputBuffer" );
+	 
+      for(int i = 0; i < nRepeats; i++)
+          timedBufCLWrite( queue, inputBuffer, memScratch, nl & 0xff, false );
+	  
+      tlog->Msg( "\n\n%s\n", "2. Device kernel read of inputBuffer" );
+
+      for(int i = 0; i < nRepeats; i++)
+          timedReadKernelVerify( queue, read_kernel, inputBuffer, resultBuffer, nl & 0xff, false );
+
+      tlog->Msg( "\n\n%s\n", "3. Device kernel write to outputBuffer" );
+
+      for(int i = 0; i < nRepeats; i++)
+          timedKernel( queue, write_kernel, resultBuffer, outputBuffer, nl & 0xff, false );
+
+      tlog->Msg( "\n\n%s\n", "4. Host CL read of outputBuffer" );
+
+      for(int i = 0; i < nRepeats; i++)
+          timedBufCLRead( queue, outputBuffer, memScratch, nl & 0xff, false );
+	 
+      tlog->Msg( "%s\n", "" );
+   }
+}
+
+void runMappedReadWriteTest()
+{
+   int nl = nLoops;
+
+   void *mappedPtr;
+
+   while( nl-- )
+   {
+      tlog->loopMarker();
+
+      tlog->Msg( "\n\n%s\n", "1. Mapping copyBuffer as mappedPtr" );
+
+      timedBufMap( queue, copyBuffer, &mappedPtr, false );
+
+      tlog->Msg( "\n\n%s\n", "2. Host CL write from mappedPtr to inputBuffer" );
+
+      for(int i = 0; i < nRepeats; i++)
+          timedBufCLWrite( queue, inputBuffer, mappedPtr, nl & 0xff, false );
+	  
+      tlog->Msg( "\n\n%s\n", "3. Device kernel read of inputBuffer" );
+
+      for(int i = 0; i < nRepeats; i++)
+          timedReadKernelVerify( queue, read_kernel, inputBuffer, resultBuffer, nl & 0xff, false );
+
+      tlog->Msg( "\n\n%s\n", "4. Device kernel write to outputBuffer" );
+
+      for(int i = 0; i < nRepeats; i++)
+          timedKernel( queue, write_kernel, resultBuffer, outputBuffer, nl & 0xff, false );
+
+      tlog->Msg( "\n\n%s\n", "5. Host CL read of outputBuffer to mappedPtr" );
+	  
+	  
+      for(int i = 0; i < nRepeats; i++)
+          timedBufCLRead( queue, outputBuffer, mappedPtr, nl & 0xff, false );
+	  
+      tlog->Msg( "\n\n%s\n", "6. Unmapping copyBuffer" );
+
+      timedBufUnmap( queue, copyBuffer, &mappedPtr, false );
+
+      tlog->Msg( "%s\n", "" );
+   }
+}
+
+void runCopyTest()
+{
+   int nl = nLoops;
+
+   while( nl-- )
+   {
+      tlog->loopMarker();
+
+      tlog->Msg( "\n\n%s\n", "1. Host mapped write to copyBuffer" );
+
+      for(int i = 0; i < nRepeats; i++)
+          timedBufMappedWrite( queue, copyBuffer, nl & 0xff, false );
+
+      tlog->Msg( "\n\n%s\n", "2. CL copy of copyBuffer to inputBuffer" );
+
+      for(int i = 0; i < nRepeats; i++)
+          timedBufCLCopy( queue, copyBuffer, inputBuffer );
+
+      tlog->Msg( "\n\n%s\n", "3. Device kernel read of inputBuffer" );
+
+      for(int i = 0; i < nRepeats; i++)
+          timedReadKernelVerify( queue, read_kernel, inputBuffer, resultBuffer, nl & 0xff, false );
+
+      tlog->Msg( "\n\n%s\n", "4. Device kernel write to outputBuffer" );
+
+      for(int i = 0; i < nRepeats; i++)
+          timedKernel( queue, write_kernel, resultBuffer, outputBuffer, nl & 0xff, false );
+
+      tlog->Msg( "\n\n%s\n", "5. CL copy of outputBuffer to copyBuffer" );
+
+      for(int i = 0; i < nRepeats; i++)
+          timedBufCLCopy( queue, outputBuffer, copyBuffer );
+
+      tlog->Msg( "\n\n%s\n", "6. Host mapped read of copyBuffer" );
+
+      for(int i = 0; i < nRepeats; i++)
+          timedBufMappedRead( queue, copyBuffer, nl & 0xff, false );
+
+      tlog->Msg( "%s\n", "" );
+   }
+}
+
+void initDefaults()
+{
+    nWorkers = 1;
+    nLoops = 20;
+	loop = 20;
+    nRepeats = 1;
+    nSkip = 2;
+    nKLoops = 20;
+
+    nBytes = 32*1024*1024;
+    nAlign = 4096;
+
+    printLog = false;
+    doHost = true;
+    whichTest = 0;
+    mapRW = false;
+	timings = false;
+    nWF = 7;
+    devnum = 0;
+}
+
+void computeGlobals()
+{
+    if( nWorkers > MAXWORKERS ) nWorkers = MAXWORKERS;
+    if( nWorkers <= 0 ) nWorkers = 1;
+
+    cl_mem_flags f = CL_MEM_READ_ONLY |
+                     CL_MEM_WRITE_ONLY |
+                     CL_MEM_READ_WRITE;
+
+    if( (inFlags & f) == 0 )
+             inFlags |= CL_MEM_READ_ONLY;
+
+    if( (outFlags & f) == 0 )
+             outFlags |= CL_MEM_WRITE_ONLY;
+
+    f |= CL_MEM_USE_HOST_PTR |
+         CL_MEM_COPY_HOST_PTR |
+         CL_MEM_ALLOC_HOST_PTR;
+
+    /*
+    f |= CL_MEM_USE_HOST_PTR |
+         CL_MEM_COPY_HOST_PTR |
+         CL_MEM_ALLOC_HOST_PTR |
+         CL_MEM_USE_PERSISTENT_MEM_AMD;
+    */
+
+    if( (copyFlags & f) == 0 )
+             copyFlags = CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE;
+
+    f = CL_MEM_READ_ONLY |
+        CL_MEM_WRITE_ONLY |
+        CL_MEM_READ_WRITE;
+
+    if( (copyFlags & f) == 0 )
+             copyFlags |= CL_MEM_READ_WRITE;
+
+    nSkip = nLoops > nSkip ? nSkip : 0;
+
+    if( signalA )
+    {
+        inFlags = CL_MEM_READ_ONLY;
+        outFlags = CL_MEM_WRITE_ONLY;
+        copyFlags = CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE;
+        nKLoops = 1;
+        doHost = false;
+    }
+
+    // educated guess of optimal work size
+    int minBytes = WS * sizeof( cl_uint ) * 4 * nWorkers;
+
+    nBytes = ( nBytes / minBytes ) * minBytes;
+    nBytes = nBytes < minBytes ? minBytes : nBytes;
+    nItems = nBytes / ( 4 * sizeof(cl_uint) );
+    
+    int maxThreads = nBytes / ( 4 * sizeof( cl_uint ) );
+
+    nThreads = deviceMaxComputeUnits * nWF * WS;
+ 
+    if( nThreads > maxThreads )
+        nThreads = maxThreads;
+    else
+    {
+        while( nItems % nThreads != 0 )
+            nThreads += WS;
+    }
+
+    nBytesResult = ( nThreads / WS ) * sizeof(cl_uint);
+}
+
+int main(int argc, char **argv)
+{
+	initDefaults();
+    parseOptions( argc, argv );
+
+    tlog = new TestLog( nLoops * nRepeats * 50 );
+
+	initCL( (char *) "BufferBandwidth_Kernels.cl", setupTime );
+
+    computeGlobals();
+    printHeader();
+    createBuffers();
+
+	#ifdef MEM_MULTICORE
+		launchThreads();
+	#endif
+
+    if( doHost )
+       assessHostMemPerf( memIn, memOut, nBytes );
+	if(signalA)
+	{
+		if(signalB)
+		{
+			runPCIeTestNoblock();
+		}
+		else
+		{
+			runPCIeTest();
+		}
+	}
+	else
+	{
+		switch( whichTest )
+		{
+		   case 0: runMapTest(); break;
+		   case 1: runReadWriteTest(); break;
+		   case 2: runCopyTest(); break;
+		   case 3: runMappedReadWriteTest(); break;
+		}
+	}
+
+	if((!signalA) && signalB) 
+	{
+		std::cout<<"-noblock will only work when -pcie or -dma is active!"<<std::endl;
+		exit(FAILURE);
+	}
+    printResults();
+
+	#ifdef MEM_MULTICORE
+		shutdownThreads();
+	#endif
+
+    if( vFailure )
+    {
+        std::cerr << "Failed!" << std::endl;
+		return FAILURE;
+    }                                                                                                                                                                                                                 
+    else
+    {
+        std::cout << "Passed!" << std::endl;
+		return SUCCESS;
+    }
+}
diff --git a/External/OpenCL/tests/AMDAPP_SDK/BufferBandwidth/BufferBandwidth.hpp b/External/OpenCL/tests/AMDAPP_SDK/BufferBandwidth/BufferBandwidth.hpp
new file mode 100644
index 0000000000..1de147c21c
--- /dev/null
+++ b/External/OpenCL/tests/AMDAPP_SDK/BufferBandwidth/BufferBandwidth.hpp
@@ -0,0 +1,212 @@
+/**********************************************************************
+Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+•	Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+•	Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or
+ other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+********************************************************************/
+
+#ifndef BUFFER_BANDWIDTH_H_
+#define BUFFER_BANDWIDTH_H_
+
+#undef   USE_CL_PROFILER
+#define  MAX_WAVEFRONT_SIZE 64               // work group size
+
+#include "Host.h"
+#include "Log.h"
+#include "Timer.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+
+
+/**
+ * BufferBandwidth 
+ * Class implements OpenCL Constant Buffer Bandwidth sample
+ * Derived from SDKSample base class
+ */
+
+class BufferBandwidth 
+{
+    bool correctness;  // correctness status variable
+    bool enable;       // Enable flags -i, -r, -k, -x and -s
+    int nLoops;        // overall number of timing loops
+    int nRepeats;      // # of repeats for each transfer step
+    int nSkip;         // to discount lazy allocation effects, etc.
+    int nKLoops;       // repeat inside kernel to show peak mem B/W,
+
+    int nBytes;        // input and output buffer size
+    int nThreads;      // number of Device work items
+    int nItems;        // number of 32-bit 4-vectors for Device kernel
+    int nAlign;        // safe bet for most PCs
+    int nBytesResult;
+
+    bool printLog;
+    bool doHost;
+    int  whichTest;   // Type of the test
+    bool mapRW;
+    int  numWavefronts;
+
+    TestLog *tlog;    // Log information class
+
+    void *memIn;
+    void *memOut;
+    void *memResult;
+    void *memRW;
+
+    int inFlagsValue; 
+    int outFlagsValue;
+    int copyFlagsValue;
+ 
+public:
+    /** 
+     * Constructor 
+     * Initialize member variables
+     * @param name name of sample (string)
+     */
+    BufferBandwidth(std::string name)
+        :
+         nLoops(20),
+         nRepeats(1),
+         nSkip(2),
+         nKLoops(20),
+         nBytes(32 * 1024 * 1024),
+         nThreads(MAX_WAVEFRONT_SIZE), 
+         nItems(2), 
+         nAlign(4096),
+         nBytesResult(1024 * 1024),
+         printLog(false),
+         doHost(false),
+         whichTest(0),
+         mapRW(false), 
+         numWavefronts(7), 
+         tlog(NULL),
+         memIn(NULL), 
+         memOut(NULL),
+         memResult(NULL), 
+         memRW(NULL),
+         inFlagsValue(0), 
+         outFlagsValue(1),
+         copyFlagsValue(2),
+         correctness(true),
+         enable(false)
+    {
+    }
+
+    /** 
+     * Constructor 
+     * Initialize member variables
+     * @param name name of sample (const char*)
+     */
+    BufferBandwidth(const char* name)
+        :
+         nLoops(20),
+         nRepeats(1),
+         nSkip(2),
+         nKLoops(20),
+         nBytes(32 * 1024 * 1024), 
+         nThreads(MAX_WAVEFRONT_SIZE), 
+         nItems(2), 
+         nAlign(4096),
+         nBytesResult(1024 * 1024),
+         printLog(false),
+         doHost(false),
+         whichTest(0),
+         mapRW(false), 
+         numWavefronts(7), 
+         tlog(NULL),
+         memIn(NULL), 
+         memOut(NULL),
+         memResult(NULL), 
+         memRW(NULL),
+         inFlagsValue(0), 
+         outFlagsValue(1),
+         copyFlagsValue(2),
+         correctness(true),
+         enable(false)
+    {
+    }
+
+    /**
+     * Allocate and initialize host memory array with random values
+     * @return 1 on success and 0 on failure
+     */
+    int setupBufferBandwidth();
+
+    /**
+     * OpenCL related initialisations. 
+     * Set up Context, Device list, Command Queue, Memory buffers
+     * Build CL kernel program executable
+     * @return 1 on success and 0 on failure
+     */
+    int setupCL();
+
+
+    /**
+     * Override from SDKSample. Initialize 
+     * command line parser, add custom options
+     */
+    int initialize();
+
+    /**
+     * Override from SDKSample, Generate binary image of given kernel 
+     * and exit application
+     */
+    int genBinaryImage();
+
+    /**
+     * Override from SDKSample, adjust width and height 
+     * of execution domain, perform all sample setup
+     */
+    int setup();
+
+    /**
+     * Override from SDKSample
+     */
+    int run();
+
+    /**
+     * Override from SDKSample
+     * Cleanup memory allocations
+     */
+    int cleanup();
+
+    /**
+     * Override from SDKSample
+     * Verify against reference implementation
+     */
+    int verifyResults();
+
+    void printStats();
+
+    /**
+     * Parses Extra command line options and 
+     * calls SDKSample::parseCommandLine()
+     */
+    int parseExtraCommandLineOptions(int argc, char**argv);
+    int runMapTest();
+    int runReadWriteTest();
+    int runCopyTest();
+    int timedBufMappedRead(cl_mem buf, unsigned char v);
+    int timedBufMappedWrite(cl_mem buf, unsigned char v);
+    int timedBufCLRead(cl_mem buf, void *ptr, unsigned char v);
+    int timedBufCLWrite(cl_mem buf, void *ptr);
+    int timedBufCLCopy(cl_mem srcBuf, cl_mem dstBuf);
+    int timedKernel(cl_kernel kernel, cl_mem bufSrc, cl_mem bufDst, unsigned char v);
+    int timedReadKernelVerify(cl_kernel kernel, cl_mem bufSrc, cl_mem bufRes, unsigned char v);
+    void printLogMsg();
+
+};
+
+
+#endif
diff --git a/External/OpenCL/tests/AMDAPP_SDK/BufferBandwidth/BufferBandwidth_Kernels.cl b/External/OpenCL/tests/AMDAPP_SDK/BufferBandwidth/BufferBandwidth_Kernels.cl
new file mode 100644
index 0000000000..10b0da17e2
--- /dev/null
+++ b/External/OpenCL/tests/AMDAPP_SDK/BufferBandwidth/BufferBandwidth_Kernels.cl
@@ -0,0 +1,73 @@
+/**********************************************************************
+Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+•	Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+•	Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or
+ other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+********************************************************************/
+
+
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+
+__kernel void read_kernel ( __global uint4 *in,
+                            __global uint  *out,
+                                     uint   ni,
+                                     uint   val,
+                                     uint   nk )
+{
+   if( nk == 0 ) return;
+   
+           uint pcount = 0;
+   __local uint lcount;
+           uint i, idx;
+
+   if( get_local_id(0) == 0)
+       lcount=0;
+
+   barrier( CLK_LOCAL_MEM_FENCE );
+
+   for(int n=0; n<nk; n++)
+      for( i=0, idx=get_global_id(0); i<ni; i++, idx+=get_global_size(0) )
+      {
+         if(in[idx].x == val) pcount++;
+         if(in[idx].y == val) pcount++;
+         if(in[idx].z == val) pcount++;
+         if(in[idx].w == val) pcount++;
+      }
+      
+     (void) atomic_add( &lcount, pcount );
+
+     barrier( CLK_LOCAL_MEM_FENCE );
+
+     if( get_local_id(0) == 0 )
+        out[get_group_id(0)] = lcount/nk;
+}
+
+__kernel void write_kernel ( __global uint  *in,
+                             __global uint4 *out,
+                                      uint  ni,
+                                      uint  val,
+                                      uint  nk )
+{
+   if( nk == 0 ) return;
+   
+   uint i, idx;
+   uint4 pval = (uint4) (val, val, val, val);
+
+   for(int n=0; n<nk; n++)
+      for( i=0, idx=get_global_id(0); i<ni; i++, idx+=get_global_size(0) )
+      {
+         out[idx] = pval;
+      }
+}
+
+                                  
+
diff --git a/External/OpenCL/tests/AMDAPP_SDK/BufferBandwidth/CMakeLists.txt b/External/OpenCL/tests/AMDAPP_SDK/BufferBandwidth/CMakeLists.txt
new file mode 100644
index 0000000000..ab8bc3fcd2
--- /dev/null
+++ b/External/OpenCL/tests/AMDAPP_SDK/BufferBandwidth/CMakeLists.txt
@@ -0,0 +1,25 @@
+# Test parameters
+set(TestName BufferBandwidth)
+set(TestSources BufferBandwidth.cpp Log.cpp Host.cpp Shared.cpp Timer.cpp)
+set(TestData BufferBandwidth_Kernels.cl)
+set(OpenCL_CPPFLAGS -std=c++14 -Wno-c++11-narrowing)
+set(OpenCL_LDFLAGS "")
+set(OpenCL_LIBS "")
+
+# Target to build this specific test.
+add_custom_target(opencl-tests-simple-${TestName})
+
+# Add dependencies for target to build all the tests in opencl-tests-simple. 
+add_dependencies(opencl-tests-simple opencl-tests-simple-${TestName})
+create_local_test(${TestName} "${TestSources}" "${TestData}" "${OpenCL_CPPFLAGS}" "${OpenCL_LDFLAGS}" "${OpenCL_LIBS}")
+
+# Populate & propogate list of tests to run them all using check-opencl-simple.
+list(APPEND OPENCL_SIMPLE_TESTS_LIST "${CMAKE_CURRENT_BINARY_DIR}/${TestName}.test")
+set(OPENCL_SIMPLE_TESTS_LIST ${OPENCL_SIMPLE_TESTS_LIST} PARENT_SCOPE)
+
+# Target to run this specific test.
+add_custom_target(check-opencl-simple-${TestName}
+  COMMAND ${TEST_SUITE_LIT} ${TEST_SUITE_LIT_FLAGS} ${TestName}.test
+  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+  DEPENDS opencl-tests-simple-${TestName}
+  USES_TERMINAL)
diff --git a/External/OpenCL/tests/AMDAPP_SDK/BufferBandwidth/Host.cpp b/External/OpenCL/tests/AMDAPP_SDK/BufferBandwidth/Host.cpp
new file mode 100644
index 0000000000..4d658f8489
--- /dev/null
+++ b/External/OpenCL/tests/AMDAPP_SDK/BufferBandwidth/Host.cpp
@@ -0,0 +1,621 @@
+/**********************************************************************
+Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+•	Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+•	Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or
+ other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+********************************************************************/
+
+#include "Host.h"
+#include "Timer.h"
+
+#include <iostream>
+#include <iomanip>
+
+#include <stdio.h>
+#if !defined(__MINGW64__)
+#include <emmintrin.h>
+#include <xmmintrin.h>
+#endif
+
+#ifdef _WIN32
+#include <windows.h>
+#include <process.h>
+#else
+#include <unistd.h>
+#include <string.h>
+#endif
+
+#define MTYPE unsigned long
+
+// Suppress the warning #810 if intel compiler is used.
+#if defined(__INTEL_COMPILER) || defined(__MINGW64__)
+#pragma warning(disable : 810)
+#endif
+
+int nWorkers;
+
+typedef struct {
+
+    void (*work_func)( int );
+
+    void *ptr;
+    void *ptr2;
+    unsigned char v;
+    size_t len;
+
+    bool ret;
+    bool noop;
+    bool exit;
+
+} _threadWork;
+
+static _threadWork work[MAXWORKERS];
+
+#ifdef MEM_MULTICORE
+
+static HANDLE      tn[MAXWORKERS];
+static unsigned    tid[MAXWORKERS];
+
+// pad to something > cacheline to avoid false sharing
+static int volatile myBarrier[MAXWORKERS][64]={0};
+
+unsigned int __stdcall myThreadFunc( void * arg )
+{
+    int* idPtr = (int*)(arg);
+    int id = *idPtr;
+    bool exit = false;
+
+    while( !exit )
+    {
+        // counterpart to releaseThreads()
+        while(myBarrier[id][0] != 1)
+            Sleep(0);
+        myBarrier[id][0] = 0;
+
+        if( work[id].work_func != NULL )
+            work[id].work_func( id );
+
+        if( work[id].exit == true )
+            exit = true;
+
+        // counterpart to waitForThreads()
+        myBarrier[id][1] = 1;
+        Sleep(0);
+    }
+
+    return 0;
+}
+#endif
+
+void releaseThreads()
+{
+#ifdef MEM_MULTICORE
+    int n;
+
+    for( n=1; n<nWorkers; n++ )
+    {
+        myBarrier[n][0] = 1;
+    }
+    Sleep(0);
+#endif
+}
+
+void waitForThreads()
+{
+#ifdef MEM_MULTICORE
+    int n;
+
+    for( n = 1; n < nWorkers; n++ )
+    {
+        while( myBarrier[n][1] != 1 )
+            Sleep(0);
+
+        myBarrier[n][1] = 0;
+    }
+#endif
+}
+
+void launchThreads()
+{
+#ifdef MEM_MULTICORE
+    int *nArg = (int*)malloc(sizeof(int) *(nWorkers + 1));
+    int n;
+    for( n = 1; n < nWorkers; n++ )
+    {
+        nArg[n] = n;
+        work[n].exit = false;
+        work[n].work_func = NULL;
+
+        tn[n] = (HANDLE) _beginthreadex( NULL, 0, myThreadFunc, (void *) (&nArg[n]), 0, &tid[n] );
+    }
+
+    // XXX if this isn't here subsequent barrier roundtrips
+    // are more expensive. Why?
+    releaseThreads();
+    waitForThreads();
+    if(nArg)
+        free(nArg);
+#endif
+}
+
+void shutdownThreads()
+{
+#ifdef MEM_MULTICORE
+    int n;
+
+    for( n = 1; n < nWorkers; n++ )
+    {
+        work[n].exit = true;
+        work[n].work_func = NULL;
+    }
+
+    releaseThreads();
+    waitForThreads();
+
+    for(int n = 1; n < nWorkers; n++)
+    {
+        WaitForSingleObject( tn[n], INFINITE );
+    }
+#endif
+}
+
+bool myMasterFunc()
+{
+    releaseThreads();
+
+    // do my chunk of work
+    work[0].work_func( 0 );
+
+    waitForThreads();
+
+    bool ret = true;
+
+    for(int n = 0; n < nWorkers; n++)
+    {
+        if( work[n].ret == false ) ret = false;
+    }
+
+    return( ret );
+}
+
+void readVerifyMemCPU_WF( int id )
+{
+    work[id].ret = readVerifyMemCPU( work[id].ptr, work[id].v, work[id].len );
+}
+
+bool readVerifyMemCPU_MT( void *ptr, unsigned char v, size_t len )
+{
+    size_t blksz = len / nWorkers;
+
+    // distribute work
+    for(int n = 0; n < nWorkers; n++)
+    {
+        work[n].work_func = readVerifyMemCPU_WF;
+        work[n].ptr = (void *) ((char *) ptr + n * blksz);
+        work[n].v = v;
+        work[n].len = blksz;
+    }
+
+    return myMasterFunc();
+}
+
+void memcpy_WF( int id )
+{
+    memcpy( work[id].ptr, work[id].ptr2, work[id].len );
+}
+
+void memcpy_MT( void *ptr, void *ptr2, size_t len )
+{
+    size_t blksz = len / nWorkers;
+
+    // distribute work
+    for(int n = 0; n < nWorkers; n++)
+    {
+        work[n].work_func = memcpy_WF;
+        work[n].ptr = (void *) ((char *) ptr + n * blksz);
+        work[n].ptr2 = (void *) ((char *) ptr2 + n * blksz);
+        work[n].len = blksz;
+    }
+
+    (void) myMasterFunc();
+}
+
+void memset_WF( int id )
+{
+    memset( work[id].ptr, work[id].v, work[id].len );
+}
+
+void memset_MT( void *ptr, unsigned char v, size_t len )
+{
+    size_t blksz = len / nWorkers;
+
+    // distribute work
+    for(int n = 0; n < nWorkers; n++)
+    {
+        work[n].work_func = memset_WF;
+        work[n].ptr = (void *) ((char *) ptr + n * blksz);
+        work[n].v = v;
+        work[n].len = blksz;
+    }
+
+    (void) myMasterFunc();
+}
+
+void empty_WF( int id )
+{
+    volatile int i=0;
+}
+
+void empty_MT()
+{
+    // distribute work
+    for(int n = 0; n < nWorkers; n++) {
+        work[n].work_func = empty_WF;
+    }
+
+    (void) myMasterFunc();
+}
+
+void benchLaunch()
+{
+   CPerfCounter t; t.Reset();
+
+   int nl = 100;
+
+   t.Start();
+
+   for(int n = 0; n < nl; n++)
+   {
+      launchThreads();
+      shutdownThreads();
+   }
+
+   t.Stop();
+
+   std::cout << "Launch speed" << "  " << t.GetElapsedTime() / nl * 1e9 << nWorkers << " ns\n";
+}
+
+void benchBarrier()
+{
+   CPerfCounter t; 
+   t.Reset();
+
+   int nl=100000;
+
+   t.Start();
+
+   for(int n = 0; n < nl; n++)
+   {
+      empty_MT();
+   }
+
+   t.Stop();
+   std::cout << std::setw(21) << std::left << "Barrier speed" << std::setw(7) << t.GetElapsedTime() / nl * 1e9 << " ns\n";
+}
+
+bool readVerifyMemCPU( void *ptr, unsigned char v, size_t len )
+{
+   register MTYPE r = (MTYPE) 0;
+   register MTYPE *p = (MTYPE *) ptr;
+   register size_t i = len / sizeof(MTYPE);
+   register unsigned int idx = 0;
+
+#define UNROLL 32
+
+   while( i >= UNROLL )
+   {
+      _mm_prefetch( (char *) &p[idx + 8], _MM_HINT_NTA );
+
+      r |= p[idx];
+      r |= p[idx + 1];
+      r |= p[idx + 2];
+      r |= p[idx + 3];
+      r |= p[idx + 4];
+      r |= p[idx + 5];
+      r |= p[idx + 6];
+      r |= p[idx + 7];
+
+      _mm_prefetch( (char *) &p[idx + 16], _MM_HINT_NTA );
+
+      r |= p[idx + 8];
+      r |= p[idx + 9];
+      r |= p[idx + 10];
+      r |= p[idx + 11];
+      r |= p[idx + 12];
+      r |= p[idx + 13];
+      r |= p[idx + 14];
+      r |= p[idx + 15];
+
+      _mm_prefetch( (char *) &p[idx + 24], _MM_HINT_NTA );
+
+      r |= p[idx + 16];
+      r |= p[idx + 17];
+      r |= p[idx + 18];
+      r |= p[idx + 19];
+      r |= p[idx + 20];
+      r |= p[idx + 21];
+      r |= p[idx + 22];
+      r |= p[idx + 23];
+
+      _mm_prefetch( (char *) &p[idx + 32], _MM_HINT_NTA );
+
+      r |= p[idx + 24];
+      r |= p[idx + 25];
+      r |= p[idx + 26];
+      r |= p[idx + 27];
+      r |= p[idx + 28];
+      r |= p[idx + 29];
+      r |= p[idx + 30];
+      r |= p[idx + 31];
+
+      i -= UNROLL;
+      idx += UNROLL;
+   }
+
+   // make sure compiler can't optimize
+   static __volatile MTYPE always = r;
+
+   MTYPE val = 0;
+
+   for(int i = 0; i < sizeof(MTYPE); i++)
+       val |= (MTYPE) v << (i*8);
+
+   if( r == val )
+       return true;
+   else
+       return false;
+}
+
+bool readmem2DPitch( void *ptr, unsigned char v, size_t pitch, int rows )
+{
+   register MTYPE r = (MTYPE) 0;
+   register MTYPE *p;
+   register size_t i;
+   register unsigned int idx;
+
+   for( int row = 0; row < rows; row++ )
+   {
+       p = (MTYPE *) ((unsigned char *) ptr + (size_t) row * pitch );
+       idx = 0;
+       i = pitch / sizeof( MTYPE );
+
+#define UNROLL 32
+       while( i >= UNROLL )
+       {
+          _mm_prefetch( (char *) &p[idx + UNROLL * sizeof(MTYPE)], _MM_HINT_NTA );
+
+          r |= p[idx];
+          r |= p[idx + 1];
+          r |= p[idx + 2];
+          r |= p[idx + 3];
+          r |= p[idx + 4];
+          r |= p[idx + 5];
+          r |= p[idx + 6];
+          r |= p[idx + 7];
+          r |= p[idx + 8];
+          r |= p[idx + 9];
+          r |= p[idx + 10];
+          r |= p[idx + 11];
+          r |= p[idx + 12];
+          r |= p[idx + 13];
+          r |= p[idx + 14];
+          r |= p[idx + 15];
+          r |= p[idx + 16];
+          r |= p[idx + 17];
+          r |= p[idx + 18];
+          r |= p[idx + 19];
+          r |= p[idx + 20];
+          r |= p[idx + 21];
+          r |= p[idx + 22];
+          r |= p[idx + 23];
+          r |= p[idx + 24];
+          r |= p[idx + 25];
+          r |= p[idx + 26];
+          r |= p[idx + 27];
+          r |= p[idx + 28];
+          r |= p[idx + 29];
+          r |= p[idx + 30];
+          r |= p[idx + 31];
+
+          i -= UNROLL;
+          idx += UNROLL;
+       }
+   }
+
+   // make sure compiler can't optimize
+   static __volatile MTYPE always = r;
+
+   MTYPE val = 0;
+
+   for(int i = 0; i < sizeof(MTYPE); i++)
+       val |= (MTYPE) v << (i * 8);
+
+   if( r == val )
+       return true;
+   else
+       return false;
+}
+
+void writeMemCPU( void *ptr, unsigned char v, size_t len )
+{
+   register MTYPE r = 0;
+   register MTYPE *p = (MTYPE *) ptr;
+   register size_t i = len / sizeof(MTYPE);
+   register size_t idx = 0;
+
+   for(int i = 0; i < sizeof(MTYPE); i++)
+       r |= (MTYPE) v << (i * 8);
+
+   while( idx < (const size_t) ( len / sizeof(MTYPE) ) )
+   {
+      p[idx] = r;
+      idx++;
+   }
+}
+
+bool readVerifyMemSSE( void *ptr, unsigned char v, size_t len )
+{
+    register __m128i r1 = _mm_setzero_si128();
+    register __m128i r2 = _mm_setzero_si128();
+    register __m128i *p = (__m128i *) ptr;
+    register unsigned int idx = 0;
+
+    while(idx < (const size_t) (len / sizeof(__m128i)) ) 
+    {
+        if( idx < (const size_t) (len / sizeof(__m128i)) - 64 )
+           _mm_prefetch( (char *) &p[idx + 64], _MM_HINT_NTA );
+
+        r1 = _mm_load_si128( &p[idx] );
+        r2 = _mm_or_si128( r1, r2 );
+
+        idx++;
+    }
+
+    // make sure compiler can't optimize
+    static __volatile __m128i always = r2;
+
+    unsigned char res[16];
+    _mm_storeu_si128( (__m128i *) res, r2 );
+
+    bool ret = true;
+
+    unsigned char val;
+    val = v;
+
+    for(int i = 0; i < sizeof( __m128i ); i++)
+        if( res[i] != val ) {
+            ret = false;
+        }
+
+    return ret;
+}
+
+void writeMemSSE ( void *ptr, unsigned char val, size_t len )
+{
+    register const __m128i r1 = _mm_set1_epi8( val );
+    register __m128i *p = (__m128i *) ptr;
+    register size_t idx = 0;
+
+    while( idx < (const size_t) (len / sizeof(__m128i)) )
+    { 
+        _mm_store_si128( &p[idx], r1 );
+        idx++;
+    }
+}
+
+void memset2DPitch( void *ptr, unsigned char val, size_t columns, size_t rows, size_t pitch )
+{
+    for( size_t r = 0; r < rows; r++ )
+        memset( ( unsigned char * ) ptr + r * pitch, val, columns );
+}
+
+void stridePagesCPU( void *ptr, size_t stride, size_t nbytes )
+{
+    register unsigned int *p = ( unsigned int * ) ptr;
+    register size_t i;
+
+    CPerfCounter t;
+    double kTime;
+
+    t.Reset();
+    t.Start();
+
+    for(i = 0; i < nbytes/sizeof(unsigned int); i += stride/sizeof(unsigned int))
+        p[i] = 0;
+
+    t.Stop();
+    kTime = t.GetElapsedTime();
+
+    std::cout << std::setw(21) << std::left << "Page fault" << std::setw(7) << (kTime*1e9) / ((double) nbytes/stride) << " ns" << std::endl;
+}
+
+#define TIMED_LOOP( STRING, EXPR, NBYTES ) \
+{\
+   t.Reset();\
+   t.Start();\
+\
+   int nl = 20;\
+   for( int i = 0; i < nl; i++ )\
+      EXPR;\
+\
+   t.Stop();\
+\
+    std::cout << std::setw(21) << std::left << STRING << (((double) nl*(NBYTES)) / t.GetElapsedTime()) / 1e9 << " GB/s\n"; \
+}
+
+void assessHostMemPerf( void *ptr, void *ptr2, size_t nbytes )
+{
+    CPerfCounter t;
+
+    std::cout << "Host baseline (naive):\n\n";
+
+    double sum = 0.;
+    int ctr = 0;
+
+    for(int i = 0; i < 1e6; i++)
+    {
+        t.Reset();
+        t.Start();
+        t.Stop();
+
+        double e = t.GetElapsedTime();
+
+        if( e > 0. ) {
+            sum += e;
+             ctr++;
+        }
+    }
+
+	std::cout << std::setiosflags(std::ios::fixed) << std::setprecision(2);
+	std::cout << std::setw(21) << std::left << "Timer resolution" 
+              << std::setw(7) << ( sum / (double) ctr ) * 1e9 << " ns\n";
+
+#ifdef _WIN32
+    //Sleep( 1000 );
+#else
+    usleep( 1000 * 1e3 );
+#endif
+    size_t pagesize;
+
+#ifdef _WIN32
+    SYSTEM_INFO system_info;
+
+    GetSystemInfo (&system_info);
+    pagesize = (size_t) system_info.dwPageSize;
+#else
+    pagesize = getpagesize();
+#endif
+
+    stridePagesCPU( ptr, pagesize, nbytes );
+
+#ifdef MEM_MULTICORE
+    benchBarrier();
+
+    std::cout << "\n";
+
+#endif
+
+#if 0
+    TIMED_LOOP( "SSE read", readVerifyMemSSE( ptr, 0, nbytes ), nbytes )
+    TIMED_LOOP( "SSE write", writeMemSSE( ptr, 0, nbytes ), nbytes )
+    TIMED_LOOP( "CPU write", writeMemCPU( ptr, 0, nbytes ), nbytes )
+#endif
+
+    TIMED_LOOP( "CPU read", readVerifyMemCPU_MT( ptr, 0, nbytes ), nbytes )
+
+    TIMED_LOOP( "memcpy()", memcpy_MT( ptr, ptr2, nbytes ), nbytes )
+
+    TIMED_LOOP( "memset(,1,)", memset_MT( ptr, 1, nbytes ), nbytes )
+    TIMED_LOOP( "memset(,0,)", memset_MT( ptr, 0, nbytes ), nbytes )
+
+    std::cout << "\n";
+}
diff --git a/External/OpenCL/tests/AMDAPP_SDK/BufferBandwidth/Host.h b/External/OpenCL/tests/AMDAPP_SDK/BufferBandwidth/Host.h
new file mode 100644
index 0000000000..3d5890572a
--- /dev/null
+++ b/External/OpenCL/tests/AMDAPP_SDK/BufferBandwidth/Host.h
@@ -0,0 +1,50 @@
+/**********************************************************************
+Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+•	Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+•	Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or
+ other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+********************************************************************/
+
+#ifndef _HOST_H_
+#define _HOST_H_
+
+#include <stddef.h>
+
+#ifdef _WIN32
+#define MEM_MULTICORE
+#define MAXWORKERS 32
+#else
+#define MAXWORKERS 1
+#endif
+
+extern int nWorkers;
+
+bool readVerifyMemCPU( void *, unsigned char, size_t );
+bool readVerifyMemCPU_MT( void *, unsigned char, size_t );
+void memset_MT( void *, unsigned char, size_t );
+void memcpy_MT( void *, void *, size_t );
+void writeMemCPU( void *, unsigned char, size_t );
+bool readVerifyMemSSE( void *, unsigned char, size_t );
+void writeMemSSE ( void *, unsigned char, size_t );
+
+bool readmem2DPitch( void *, unsigned char, size_t, int );
+void memset2DPitch( void *, unsigned char, size_t, size_t, size_t );
+
+void runon( unsigned int );
+void stridePagesCPU( void *, size_t );
+void assessHostMemPerf( void *, void *, size_t );
+
+void benchThreads();
+void launchThreads();
+void shutdownThreads();
+
+#endif // _HOST_H_
diff --git a/External/OpenCL/tests/AMDAPP_SDK/BufferBandwidth/Log.cpp b/External/OpenCL/tests/AMDAPP_SDK/BufferBandwidth/Log.cpp
new file mode 100644
index 0000000000..146799e4b5
--- /dev/null
+++ b/External/OpenCL/tests/AMDAPP_SDK/BufferBandwidth/Log.cpp
@@ -0,0 +1,172 @@
+/**********************************************************************
+Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+•	Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+•	Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or
+ other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+********************************************************************/
+
+#include "Log.h"
+
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <iostream>
+#include <iomanip>
+
+void Sample::setMsg( const char *fmt, const char *msg )
+{
+    _isMsg = true;
+
+    _fmt = new char[ strlen( fmt ) + 1 ];
+    strcpy( _fmt, fmt );
+    _msg = new char[ strlen( msg ) + 1 ];
+    strcpy( _msg, msg );
+}
+
+void Sample::setTimer( const char *msg, double timer, unsigned int nbytes, int loops )
+{
+    _isMsg = false;
+    _timer = timer;
+
+    if( loops != 0 ) _loops = loops;
+    if( nbytes > 0 ) _nbytes = nbytes;
+
+    if ( strlen( msg ) > 0 )
+    {
+        _msg = new char[ strlen( msg ) + 1 ];
+        strcpy( _msg, msg );
+    }
+}
+
+void Sample::printSample( int firstColumnWidth )
+{
+    if(firstColumnWidth == 0)	// Default column width
+			firstColumnWidth = 30;
+
+	if( _isMsg == true )
+        printf( _fmt, _msg );
+    else
+    {
+		double bwd = (((double) _nbytes * _loops ) / _timer ) / 1e9;
+
+		std::cout << " ";
+		for(int j=0; j<(firstColumnWidth+5); j++)
+			std::cout << "-";
+		std::cout << "|";
+		for(int j=0; j<15; j++)
+			std::cout << "-";
+		std::cout << std::endl;
+
+		std::cout << std::setprecision(3);
+		std::cout << " " << std::setw(firstColumnWidth+5) << std::left << _msg
+					<< "| " << bwd << std::endl;
+
+    }
+}
+
+TestLog::TestLog( int nSamples ) : _logIdx(0), 
+                                   _logLoops(0), 
+                                   _logLoopEntries(0),
+                                   _logLoopTimers(0),
+								   _maxMsgWidth(0)
+{
+    _samples = new Sample[ nSamples ];
+}
+
+void TestLog::loopMarker()
+{
+    _logLoopEntries = 0;
+    _logLoopTimers = 0;
+    _logLoops++;
+}
+
+void TestLog::Msg( const char *format, const char *msg )
+{
+    _samples[ _logIdx++ ].setMsg( format, msg );
+    _logLoopEntries++;
+}
+
+void TestLog::Error( const char *format, const char *msg )
+{
+    _samples[ _logIdx ].setMsg( format, msg );
+    _samples[ _logIdx++ ].setErr();
+    _logLoopEntries++;
+}
+
+void TestLog::Timer( const char *msg, double timer, unsigned int nbytes, int loops )
+{
+	_samples[ _logIdx++ ].setTimer( msg, timer, nbytes, loops );
+
+	int msgLen = (int)strlen(msg);
+	if(msgLen > _maxMsgWidth)
+		_maxMsgWidth = msgLen;
+
+    _logLoopEntries++; 
+    _logLoopTimers++;
+}
+
+void TestLog::printLog( void )
+{
+    int idx = 0;
+
+    printf( "\nLOOP ITERATIONS\n" );
+    printf( "---------------\n" );
+
+    for(int l = 0; l < _logLoops; l++)
+    {
+       printf("\nLoop %d\n\n", l );
+
+       for( int i = 0; i < _logLoopEntries; i++ )
+          _samples[ idx++ ].printSample(_maxMsgWidth);
+    }
+}
+
+void TestLog::printSummary( int skip )
+{
+   printf( "\nAVERAGES (over loops %d - %d, use -l for complete log)\n", skip, _logLoops-1 );
+   printf( "--------\n" );
+
+   for( int i = 0; i < _logLoopEntries; i++ )
+   {
+       if( _samples[ i ].isMsg() )
+       {
+           bool foundError = false;
+
+           for( int nl = 0; nl < _logLoops; nl++ )
+           {
+               int current = i + nl * _logLoopEntries;
+
+               if( _samples[ current ].isErr() )
+               {
+                   _samples[ current ].printSample();
+                   foundError = true;
+                   break;
+               }
+           }
+
+           if( !foundError )
+               _samples[ i ].printSample();
+       }
+       else
+       {
+           double sum = 0.;
+
+           for( int nl = skip; nl < _logLoops; nl++ )
+           {
+               sum += _samples[ i + nl * _logLoopEntries ].getTimer();
+           }
+
+           _samples[ i ].setTimer( "", sum / ( _logLoops - skip ), 0, 0 );
+           _samples[ i ].printSample(_maxMsgWidth);
+       }
+   }
+}
diff --git a/External/OpenCL/tests/AMDAPP_SDK/BufferBandwidth/Log.h b/External/OpenCL/tests/AMDAPP_SDK/BufferBandwidth/Log.h
new file mode 100644
index 0000000000..850b4a44d4
--- /dev/null
+++ b/External/OpenCL/tests/AMDAPP_SDK/BufferBandwidth/Log.h
@@ -0,0 +1,74 @@
+/**********************************************************************
+Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+•	Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+•	Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or
+ other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+********************************************************************/
+
+#ifndef _LOG_H_
+#define _LOG_H_
+
+extern int nBytes;
+
+class Sample {
+
+public:
+
+    Sample() : _isMsg(false), _isErr(false), _timer(0.), _msg(0), _loops(1) {}
+    ~Sample() {}
+
+    void   setMsg( const char *, const char * );
+    void   setErr( void ) { _isErr = true; }
+    bool   isMsg( void ) { return _isMsg; }
+    bool   isErr( void ) { return _isErr; }
+    void   setTimer( const char *, double, unsigned int, int );
+    double getTimer( void ) { return _timer; }
+    void   printSample ( int firstColumnWidth=0 );
+
+private:
+
+    bool          _isMsg;
+    bool          _isErr;
+    double        _timer;
+    unsigned int  _nbytes;
+    int           _loops;
+    char *        _fmt;
+    char *        _msg;
+};
+
+class TestLog {
+
+public:
+
+    TestLog( int );
+    ~TestLog() {}
+
+    void loopMarker( void );
+    void Msg( const char *, const char * );
+    void Error( const char *, const char * );
+    void Timer( const char *, double, unsigned int, int );
+
+    void printLog( void );
+    void printSummary( int );
+
+private:
+
+    int _logIdx;
+    int _logLoops;
+    int _logLoopEntries;
+    int _logLoopTimers;
+	int _maxMsgWidth;
+ 
+    Sample *_samples;
+};
+
+#endif // _LOG_H_
diff --git a/External/OpenCL/tests/AMDAPP_SDK/BufferBandwidth/Shared.cpp b/External/OpenCL/tests/AMDAPP_SDK/BufferBandwidth/Shared.cpp
new file mode 100644
index 0000000000..d8d4881f77
--- /dev/null
+++ b/External/OpenCL/tests/AMDAPP_SDK/BufferBandwidth/Shared.cpp
@@ -0,0 +1,376 @@
+/**********************************************************************
+Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+•	Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+•	Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or
+ other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+********************************************************************/
+
+#include "Shared.h"
+#include "Timer.h"
+
+cl_command_queue queue;
+cl_context       context;
+cl_kernel        read_kernel;
+cl_kernel        write_kernel;
+int              devnum;
+char             devname[256];
+
+cl_uint deviceMaxComputeUnits;
+
+cl_mem_flags inFlags = 0;
+cl_mem_flags outFlags = 0;
+cl_mem_flags copyFlags = 0;
+
+struct _flags flags[] = {
+             { CL_MEM_READ_ONLY,              "CL_MEM_READ_ONLY" },
+             { CL_MEM_WRITE_ONLY,             "CL_MEM_WRITE_ONLY" },
+             { CL_MEM_READ_WRITE,             "CL_MEM_READ_WRITE" },
+             { CL_MEM_USE_HOST_PTR,           "CL_MEM_USE_HOST_PTR" },
+             { CL_MEM_COPY_HOST_PTR,          "CL_MEM_COPY_HOST_PTR" },
+             { CL_MEM_ALLOC_HOST_PTR,         "CL_MEM_ALLOC_HOST_PTR" },
+             // { CL_MEM_USE_PERSISTENT_MEM_AMD, "CL_MEM_USE_PERSISTENT_MEM_AMD"},
+             { CL_MEM_HOST_WRITE_ONLY,        "CL_MEM_HOST_WRITE_ONLY"},
+             { CL_MEM_HOST_READ_ONLY,         "CL_MEM_HOST_READ_ONLY"},
+             { CL_MEM_HOST_NO_ACCESS,         "CL_MEM_HOST_NO_ACCESS"} };
+
+int nFlags = sizeof(flags) / sizeof(flags[0]);
+
+const char *cluErrorString(cl_int err) {
+
+   switch(err) {
+
+      case CL_SUCCESS: return "CL_SUCCESS";
+      case CL_DEVICE_NOT_FOUND: return "CL_DEVICE_NOT_FOUND";
+      case CL_DEVICE_NOT_AVAILABLE: return "CL_DEVICE_NOT_AVAILABLE";
+      case CL_COMPILER_NOT_AVAILABLE: return
+                                       "CL_COMPILER_NOT_AVAILABLE";
+      case CL_MEM_OBJECT_ALLOCATION_FAILURE: return
+                                       "CL_MEM_OBJECT_ALLOCATION_FAILURE";
+      case CL_OUT_OF_RESOURCES: return "CL_OUT_OF_RESOURCES";
+      case CL_OUT_OF_HOST_MEMORY: return "CL_OUT_OF_HOST_MEMORY";
+      case CL_PROFILING_INFO_NOT_AVAILABLE: return
+                                       "CL_PROFILING_INFO_NOT_AVAILABLE";
+      case CL_MEM_COPY_OVERLAP: return "CL_MEM_COPY_OVERLAP";
+      case CL_IMAGE_FORMAT_MISMATCH: return "CL_IMAGE_FORMAT_MISMATCH";
+      case CL_IMAGE_FORMAT_NOT_SUPPORTED: return
+                                       "CL_IMAGE_FORMAT_NOT_SUPPORTED";
+      case CL_BUILD_PROGRAM_FAILURE: return "CL_BUILD_PROGRAM_FAILURE";
+      case CL_MAP_FAILURE: return "CL_MAP_FAILURE";
+      case CL_INVALID_VALUE: return "CL_INVALID_VALUE";
+      case CL_INVALID_DEVICE_TYPE: return "CL_INVALID_DEVICE_TYPE";
+      case CL_INVALID_PLATFORM: return "CL_INVALID_PLATFORM";
+      case CL_INVALID_DEVICE: return "CL_INVALID_DEVICE";
+      case CL_INVALID_CONTEXT: return "CL_INVALID_CONTEXT";
+      case CL_INVALID_QUEUE_PROPERTIES: return "CL_INVALID_QUEUE_PROPERTIES";
+      case CL_INVALID_COMMAND_QUEUE: return "CL_INVALID_COMMAND_QUEUE";
+      case CL_INVALID_HOST_PTR: return "CL_INVALID_HOST_PTR";
+      case CL_INVALID_MEM_OBJECT: return "CL_INVALID_MEM_OBJECT";
+      case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: return
+                                       "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
+      case CL_INVALID_IMAGE_SIZE: return "CL_INVALID_IMAGE_SIZE";
+      case CL_INVALID_SAMPLER: return "CL_INVALID_SAMPLER";
+      case CL_INVALID_BINARY: return "CL_INVALID_BINARY";
+      case CL_INVALID_BUILD_OPTIONS: return "CL_INVALID_BUILD_OPTIONS";
+      case CL_INVALID_PROGRAM: return "CL_INVALID_PROGRAM";
+      case CL_INVALID_PROGRAM_EXECUTABLE: return
+                                       "CL_INVALID_PROGRAM_EXECUTABLE";
+      case CL_INVALID_KERNEL_NAME: return "CL_INVALID_KERNEL_NAME";
+      case CL_INVALID_KERNEL_DEFINITION: return "CL_INVALID_KERNEL_DEFINITION";
+      case CL_INVALID_KERNEL: return "CL_INVALID_KERNEL";
+      case CL_INVALID_ARG_INDEX: return "CL_INVALID_ARG_INDEX";
+      case CL_INVALID_ARG_VALUE: return "CL_INVALID_ARG_VALUE";
+      case CL_INVALID_ARG_SIZE: return "CL_INVALID_ARG_SIZE";
+      case CL_INVALID_KERNEL_ARGS: return "CL_INVALID_KERNEL_ARGS";
+      case CL_INVALID_WORK_DIMENSION: return "CL_INVALID_WORK_DIMENSION";
+      case CL_INVALID_WORK_GROUP_SIZE: return "CL_INVALID_WORK_GROUP_SIZE";
+      case CL_INVALID_WORK_ITEM_SIZE: return "CL_INVALID_WORK_ITEM_SIZE";
+      case CL_INVALID_GLOBAL_OFFSET: return "CL_INVALID_GLOBAL_OFFSET";
+      case CL_INVALID_EVENT_WAIT_LIST: return "CL_INVALID_EVENT_WAIT_LIST";
+      case CL_INVALID_EVENT: return "CL_INVALID_EVENT";
+      case CL_INVALID_OPERATION: return "CL_INVALID_OPERATION";
+      case CL_INVALID_GL_OBJECT: return "CL_INVALID_GL_OBJECT";
+      case CL_INVALID_BUFFER_SIZE: return "CL_INVALID_BUFFER_SIZE";
+      case CL_INVALID_MIP_LEVEL: return "CL_INVALID_MIP_LEVEL";
+      case CL_INVALID_GLOBAL_WORK_SIZE: return "CL_INVALID_GLOBAL_WORK_SIZE";
+
+      default: return "UNKNOWN CL ERROR CODE";
+   }
+}
+
+cl_int spinForEventsComplete( cl_uint num_events, cl_event *event_list )
+{
+    cl_int ret = 0;
+#if 0
+    ret = clWaitForEvents( num_events, event_list );
+#else
+    cl_int param_value;
+    size_t param_value_size_ret;
+
+    for( cl_uint e=0; e < num_events; e++ )
+    {
+        while(1)
+        {
+            ret |= clGetEventInfo( event_list[ e ], 
+                                   CL_EVENT_COMMAND_EXECUTION_STATUS,
+                                   sizeof( cl_int ),
+                                   &param_value,
+                                   &param_value_size_ret );
+
+            if( param_value == CL_COMPLETE )
+                break;
+        }
+    }
+#endif
+
+    for( cl_uint e=0; e < num_events; e++ )
+        clReleaseEvent( event_list[e] );
+    return ret;
+}
+
+void checkCLFeatures(cl_device_id device)
+{
+        // Check device extensions
+    char* deviceExtensions = NULL;;
+    size_t extStringSize = 0;
+
+    // Get device extensions 
+    clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &extStringSize);
+    deviceExtensions = new char[extStringSize];
+    if(NULL == deviceExtensions){
+        fprintf( stderr, "Failed to allocate memory(deviceExtensions)\n");
+        exit(-1);
+    }
+    clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, extStringSize, deviceExtensions, NULL);
+
+    // Check if cl_khr_fp64 extension is supported 
+    if(!strstr(deviceExtensions, "cl_khr_local_int32_base_atomics"))
+    {
+        fprintf( stderr, "Device does not support cl_khr_local_int32_base_atomics extension!\n");
+        delete deviceExtensions;
+        exit(0);
+    }
+    delete deviceExtensions;
+
+    // Get OpenCL device version
+    char deviceVersion[1024];
+    clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(deviceVersion), deviceVersion, NULL);
+
+    std::string deviceVersionStr = std::string(deviceVersion);
+    size_t vStart = deviceVersionStr.find(" ", 0);
+    size_t vEnd = deviceVersionStr.find(" ", vStart + 1);
+    std::string vStrVal = deviceVersionStr.substr(vStart + 1, vEnd - vStart - 1);
+
+    // Check of OPENCL_C_VERSION if device version is 1.1 or later
+#ifdef CL_VERSION_1_1
+    if(vStrVal.compare("1.0") > 0)
+    {
+        //Get OPENCL_C_VERSION 
+        clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, sizeof(deviceVersion), deviceVersion, NULL);
+
+        // Exit if OpenCL C device version is 1.0
+        deviceVersionStr = std::string(deviceVersion);
+        vStart = deviceVersionStr.find(" ", 0);
+        vStart = deviceVersionStr.find(" ", vStart + 1);
+        vEnd = deviceVersionStr.find(" ", vStart + 1);
+        vStrVal = deviceVersionStr.substr(vStart + 1, vEnd - vStart - 1);
+        if(vStrVal.compare("1.0") <= 0)
+        {
+            fprintf( stderr, "Unsupported device! Required CL_DEVICE_OPENCL_C_VERSION as 1.1\n");
+            exit(0);
+        }
+    }
+    else
+    {
+        fprintf( stderr, "Unsupported device! Required CL_DEVICE_OPENCL_C_VERSION as 1.1\n");
+        exit(0);
+    }
+#else
+    fprintf( stderr, "Unsupported device! Required CL_DEVICE_OPENCL_C_VERSION as 1.1\n");
+    exit(0);
+#endif
+
+    return;
+}
+
+void initCL( char *kernel_file, double &setupTime )
+{
+    // Get a platform, device, context and queue
+
+    cl_platform_id   platform = NULL;
+    cl_device_id     devices[128];
+    cl_device_id     device;
+    cl_uint          num_devices;
+    cl_int           ret;
+
+    cl_device_type   devs[] = { CL_DEVICE_TYPE_CPU,
+                                CL_DEVICE_TYPE_GPU };
+    cl_uint numPlatforms;
+    cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms);
+
+	CPerfCounter t1;
+	t1.Reset();
+    t1.Start();
+
+	if(status != 0)
+    {
+		printf("clGetPlatformIDs failed.\n");
+		exit(FAILURE);
+	}
+    if (0 < numPlatforms) 
+    {
+        cl_platform_id* platforms = new cl_platform_id[numPlatforms];
+        status = clGetPlatformIDs(numPlatforms, platforms, NULL);
+		if(status != 0)
+		{
+			printf("clGetPlatformIDs failed.\n");
+			exit(FAILURE);
+		}
+
+            char platformName[100];
+            for (unsigned i = 0; i < numPlatforms; ++i) 
+            {
+                status = clGetPlatformInfo(platforms[i],
+                                           CL_PLATFORM_VENDOR,
+                                           sizeof(platformName),
+                                           platformName,
+                                           NULL);
+				if(status != 0)
+				{
+					printf("clGetPlatformIDs failed.\n");
+					exit(FAILURE);
+				}
+
+                platform = platforms[i];
+                if (!strcmp(platformName, "Advanced Micro Devices, Inc.")) 
+                {
+                    break;
+                }
+            }
+            std::cout << "Platform found : " << platformName << "\n";
+        delete[] platforms;
+    }
+
+    if(NULL == platform)
+    {
+        printf("NULL platform found so Exiting Application.");
+        exit(FAILURE);
+    }
+
+    ret = clGetDeviceIDs( platform,
+                          devs[1],
+                          128,
+                          devices,
+                          &num_devices );
+    if((ret == CL_DEVICE_NOT_FOUND) || (num_devices == 0))
+    {
+      ret = clGetDeviceIDs( platform,
+                            devs[0],
+                            128,
+                            devices,
+                            &num_devices );
+      if((ret == CL_DEVICE_NOT_FOUND) || (num_devices == 0)) {
+        fprintf( stderr, "No valid OpenCL Device available.");
+        exit(FAILURE);
+      }
+    }
+    ASSERT_CL_RETURN( ret );
+
+    device = devices[devnum];
+
+    ret = clGetDeviceInfo( device,
+                           CL_DEVICE_MAX_COMPUTE_UNITS,
+                           sizeof(cl_uint),
+                           &deviceMaxComputeUnits,
+                           NULL);
+
+    ASSERT_CL_RETURN( ret );
+
+    ret = clGetDeviceInfo( device,
+                           CL_DEVICE_NAME,
+                           256,
+                           devname,
+                           NULL);
+
+    ASSERT_CL_RETURN( ret );
+
+    context = clCreateContext( NULL,
+                               1,
+                               &device,
+                               NULL, NULL, NULL );
+
+    queue = clCreateCommandQueue( context,
+                                 device,
+                                 0,
+                                 NULL );
+
+    // Minimal error check.
+
+    if( queue == NULL ) {
+        fprintf( stderr, "Compute device setup failed\n");
+        exit(FAILURE);
+    }
+
+    // Check for OpenCL features and extensions.
+    checkCLFeatures(device);
+
+    // Perform runtime source compilation, and obtain kernel entry points.
+    FILE *fp = fopen( kernel_file, "rb" );
+
+    if( fp == NULL )
+    {
+        fprintf( stderr, "%s:%d: can't open kernel file: %s\n", \
+             __FILE__, __LINE__, strerror( errno ));\
+        exit(FAILURE);
+    }
+
+    fseek( fp, 0, SEEK_END );
+    const size_t size = ftell( fp );
+    const char *kernel_source = (const char *) malloc( size );
+
+    rewind( fp );
+    fread( (void *) kernel_source, 1, size, fp );
+    
+    cl_program       program;
+
+    program = clCreateProgramWithSource( context,
+                                         1,
+                                         &kernel_source,
+                                         &size, 
+                                         NULL );
+
+    ret = clBuildProgram( program, 1, &device, NULL, NULL, NULL );
+
+    static char buf[0x10000]={0};
+
+    clGetProgramBuildInfo( program,
+                           device,
+                           CL_PROGRAM_BUILD_LOG,
+                           0x10000,
+                           buf,
+                           NULL );
+
+    std::cout << buf << std::endl;
+
+    ASSERT_CL_RETURN( ret );
+
+    read_kernel = clCreateKernel( program, "read_kernel", &ret );
+
+    ASSERT_CL_RETURN( ret );
+
+    write_kernel = clCreateKernel( program, "write_kernel", &ret );
+
+    ASSERT_CL_RETURN( ret );
+
+	t1.Stop();
+	setupTime = t1.GetElapsedTime();
+
+}
diff --git a/External/OpenCL/tests/AMDAPP_SDK/BufferBandwidth/Shared.h b/External/OpenCL/tests/AMDAPP_SDK/BufferBandwidth/Shared.h
new file mode 100644
index 0000000000..90c5e63499
--- /dev/null
+++ b/External/OpenCL/tests/AMDAPP_SDK/BufferBandwidth/Shared.h
@@ -0,0 +1,78 @@
+/**********************************************************************
+Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+•	Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+•	Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or
+ other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+********************************************************************/
+
+#ifndef _SHARED_H_
+#define _SHARED_H_
+
+#define SUCCESS 0
+#define FAILURE 1
+#define EXPECTED_FAILURE 2
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+#include <string.h>
+#include <iostream>
+#include <fstream>
+#include <iomanip>
+#include <sstream>
+
+#include <CL/opencl.h>
+
+#ifdef _WIN32
+#include <windows.h>
+#endif
+
+#if defined(__MINGW32__) && !defined(__MINGW64_VERSION_MAJOR)
+#define _aligned_malloc __mingw_aligned_malloc 
+#define _aligned_free  __mingw_aligned_free 
+#endif // __MINGW32__  and __MINGW64_VERSION_MAJOR
+
+#include <malloc.h>
+
+
+
+#define ASSERT_CL_RETURN( ret )\
+   if( (ret) != CL_SUCCESS )\
+   {\
+      fprintf( stderr, "%s:%d: error: %s\n", \
+             __FILE__, __LINE__, cluErrorString( (ret) ));\
+      exit(FAILURE);\
+   }
+
+extern cl_mem_flags inFlags;
+extern cl_mem_flags outFlags;
+extern cl_mem_flags copyFlags;
+
+extern struct _flags { cl_mem_flags f;
+                       const char  *s; } flags[];
+extern int nFlags;
+
+extern cl_command_queue queue;
+extern cl_context       context;
+extern cl_kernel        read_kernel;
+extern cl_kernel        write_kernel;
+extern cl_uint          deviceMaxComputeUnits;
+extern int              devnum;
+extern char             devname[];
+
+const char *cluErrorString(cl_int);
+cl_int      spinForEventsComplete( cl_uint, cl_event * );
+void        initCL( char *, double&);
+
+#endif // _SHARED_H_
diff --git a/External/OpenCL/tests/AMDAPP_SDK/BufferBandwidth/Timer.cpp b/External/OpenCL/tests/AMDAPP_SDK/BufferBandwidth/Timer.cpp
new file mode 100644
index 0000000000..cd898f5f52
--- /dev/null
+++ b/External/OpenCL/tests/AMDAPP_SDK/BufferBandwidth/Timer.cpp
@@ -0,0 +1,91 @@
+/**********************************************************************
+Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+•	Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+•	Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or
+ other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+********************************************************************/
+
+#include "Timer.h"
+
+#ifdef _WIN32
+#include <windows.h>
+#else
+#include <sys/time.h>
+#include <time.h>
+#endif
+
+CPerfCounter::CPerfCounter() : _clocks(0), _start(0)
+{
+
+#ifdef _WIN32
+    QueryPerformanceFrequency((LARGE_INTEGER *)&_freq);
+#else
+    _freq = 1000;
+#endif
+
+}
+
+CPerfCounter::~CPerfCounter()
+{
+    // EMPTY!
+}
+
+void
+CPerfCounter::Start(void)
+{
+
+#ifdef _WIN32
+    QueryPerformanceCounter((LARGE_INTEGER *)&_start);
+#else
+    struct timespec s;
+    clock_gettime( CLOCK_REALTIME, &s );
+    _start = (i64)s.tv_sec * 1e9 + (i64)s.tv_nsec;
+#endif
+
+}
+
+void
+CPerfCounter::Stop(void)
+{
+    i64 n;
+
+#ifdef _WIN32
+    QueryPerformanceCounter((LARGE_INTEGER *)&n);
+#else
+    struct timespec s;
+    clock_gettime( CLOCK_REALTIME, &s );
+    n = (i64)s.tv_sec * 1e9 + (i64)s.tv_nsec;
+#endif
+
+    n -= _start;
+    _start = 0;
+    _clocks += n;
+}
+
+void
+CPerfCounter::Reset(void)
+{
+
+    _clocks = 0;
+}
+
+double
+CPerfCounter::GetElapsedTime(void)
+{
+#if _WIN32
+    return (double)_clocks / (double) _freq;
+#else
+    return (double)_clocks / (double) 1e9;
+#endif
+
+}
+
diff --git a/External/OpenCL/tests/AMDAPP_SDK/BufferBandwidth/Timer.h b/External/OpenCL/tests/AMDAPP_SDK/BufferBandwidth/Timer.h
new file mode 100644
index 0000000000..36ca81e2af
--- /dev/null
+++ b/External/OpenCL/tests/AMDAPP_SDK/BufferBandwidth/Timer.h
@@ -0,0 +1,93 @@
+/**********************************************************************
+Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+•	Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+•	Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or
+ other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+********************************************************************/
+
+#ifndef _TIMER_H_
+#define _TIMER_H_
+/**
+ * \file Timer.h
+ * \brief A timer class that provides a cross platform timer for use
+ * in timing code progress with a high degree of accuracy.
+ */
+#ifdef _WIN32
+/**
+ * \typedef __int64 i64
+ * \brief Maps the windows 64 bit integer to a uniform name
+ */
+#if defined(__MINGW64__) || defined(__MINGW32__)
+typedef long long i64;
+#else
+typedef __int64 i64;
+#endif
+#else
+/**
+ * \typedef long long i64
+ * \brief Maps the linux 64 bit integer to a uniform name
+ */
+typedef long long i64;
+#endif
+
+/**
+ * \class CPerfCounter
+ * \brief Counter that provides a fairly accurate timing mechanism for both
+ * windows and linux. This timer is used extensively in all the samples.
+ */
+class CPerfCounter {
+
+public:
+    /**
+     * \fn CPerfCounter()
+     * \brief Constructor for CPerfCounter that initializes the class
+     */
+    CPerfCounter();
+    /**
+     * \fn ~CPerfCounter()
+     * \brief Destructor for CPerfCounter that cleans up the class
+     */
+    ~CPerfCounter();
+    /**
+     * \fn void Start(void)
+     * \brief Start the timer
+     * \sa Stop(), Reset()
+     */
+    void Start(void);
+    /**
+     * \fn void Stop(void)
+     * \brief Stop the timer
+     * \sa Start(), Reset()
+     */
+    void Stop(void);
+    /**
+     * \fn void Reset(void)
+     * \brief Reset the timer to 0
+     * \sa Start(), Stop()
+     */
+    void Reset(void);
+    /**
+     * \fn double GetElapsedTime(void)
+     * \return Amount of time that has accumulated between the \a Start()
+     * and \a Stop() function calls
+     */
+    double GetElapsedTime(void);
+
+private:
+
+    i64 _freq;
+    i64 _clocks;
+    i64 _start;
+};
+
+#endif // _TIMER_H_
+
diff --git a/External/OpenCL/tests/AMDAPP_SDK/BufferBandwidth/docs/BufferBandwidth.pdf b/External/OpenCL/tests/AMDAPP_SDK/BufferBandwidth/docs/BufferBandwidth.pdf
new file mode 100644
index 0000000000..799cfe62fa
Binary files /dev/null and b/External/OpenCL/tests/AMDAPP_SDK/BufferBandwidth/docs/BufferBandwidth.pdf differ
diff --git a/External/OpenCL/tests/AMDAPP_SDK/CMakeLists.txt b/External/OpenCL/tests/AMDAPP_SDK/CMakeLists.txt
new file mode 100644
index 0000000000..1eb4ceeac6
--- /dev/null
+++ b/External/OpenCL/tests/AMDAPP_SDK/CMakeLists.txt
@@ -0,0 +1,6 @@
+add_subdirectory(BufferBandwidth)
+add_subdirectory(HelloWorld)
+add_subdirectory(Template)
+
+# Propogate the list of tests to run.
+set(OPENCL_SIMPLE_TESTS_LIST ${OPENCL_SIMPLE_TESTS_LIST} PARENT_SCOPE)
diff --git a/External/OpenCL/tests/AMDAPP_SDK/HelloWorld/CMakeLists.txt b/External/OpenCL/tests/AMDAPP_SDK/HelloWorld/CMakeLists.txt
new file mode 100644
index 0000000000..378d9950e7
--- /dev/null
+++ b/External/OpenCL/tests/AMDAPP_SDK/HelloWorld/CMakeLists.txt
@@ -0,0 +1,25 @@
+# Test parameters
+set(TestName HelloWorld)
+set(TestSources HelloWorld.cpp)
+set(TestData HelloWorld_Kernel.cl)
+set(OpenCL_CPPFLAGS "")
+set(OpenCL_LDFLAGS "")
+set(OpenCL_LIBS "")
+
+# Target to build this specific test.
+add_custom_target(opencl-tests-simple-${TestName})
+
+# Add dependencies for target to build all the tests in opencl-tests-simple. 
+add_dependencies(opencl-tests-simple opencl-tests-simple-${TestName})
+create_local_test(${TestName} "${TestSources}" "${TestData}" "${OpenCL_CPPFLAGS}" "${OpenCL_LDFLAGS}" "${OpenCL_LIBS}")
+
+# Populate & propogate list of tests to run them all using check-opencl-simple.
+list(APPEND OPENCL_SIMPLE_TESTS_LIST "${CMAKE_CURRENT_BINARY_DIR}/${TestName}.test")
+set(OPENCL_SIMPLE_TESTS_LIST ${OPENCL_SIMPLE_TESTS_LIST} PARENT_SCOPE)
+
+# Target to run this specific test.
+add_custom_target(check-opencl-simple-${TestName}
+  COMMAND ${TEST_SUITE_LIT} ${TEST_SUITE_LIT_FLAGS} ${TestName}.test
+  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+  DEPENDS opencl-tests-simple-${TestName}
+  USES_TERMINAL)
diff --git a/External/OpenCL/tests/AMDAPP_SDK/HelloWorld/HelloWorld.cpp b/External/OpenCL/tests/AMDAPP_SDK/HelloWorld/HelloWorld.cpp
new file mode 100644
index 0000000000..b4abeadf83
--- /dev/null
+++ b/External/OpenCL/tests/AMDAPP_SDK/HelloWorld/HelloWorld.cpp
@@ -0,0 +1,171 @@
+/**********************************************************************
+Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+•	Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+•	Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or
+ other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+********************************************************************/
+
+// For clarity,error checking has been omitted.
+
+#include <CL/cl.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <iostream>
+#include <string>
+#include <fstream>
+
+#define SUCCESS 0
+#define FAILURE 1
+
+using namespace std;
+
+/* convert the kernel file into a string */
+int convertToString(const char *filename, std::string& s)
+{
+	size_t size;
+	char*  str;
+	std::fstream f(filename, (std::fstream::in | std::fstream::binary));
+
+	if(f.is_open())
+	{
+		size_t fileSize;
+		f.seekg(0, std::fstream::end);
+		size = fileSize = (size_t)f.tellg();
+		f.seekg(0, std::fstream::beg);
+		str = new char[size+1];
+		if(!str)
+		{
+			f.close();
+			return 0;
+		}
+
+		f.read(str, fileSize);
+		f.close();
+		str[size] = '\0';
+		s = str;
+		delete[] str;
+		return 0;
+	}
+	cout<<"Error: failed to open file\n:"<<filename<<endl;
+	return FAILURE;
+}
+
+int main(int argc, char* argv[])
+{
+
+	/*Step1: Getting platforms and choose an available one.*/
+	cl_uint numPlatforms;	//the NO. of platforms
+	cl_platform_id platform = NULL;	//the chosen platform
+	cl_int	status = clGetPlatformIDs(0, NULL, &numPlatforms);
+	if (status != CL_SUCCESS)
+	{
+		cout << "Error: Getting platforms!" << endl;
+		return FAILURE;
+	}
+
+	/*For clarity, choose the first available platform. */
+	if(numPlatforms > 0)
+	{
+		cl_platform_id* platforms = (cl_platform_id* )malloc(numPlatforms* sizeof(cl_platform_id));
+		status = clGetPlatformIDs(numPlatforms, platforms, NULL);
+		platform = platforms[0];
+		free(platforms);
+	}
+
+	/*Step 2:Query the platform and choose the first GPU device if has one.Otherwise use the CPU as device.*/
+	cl_uint				numDevices = 0;
+	cl_device_id        *devices;
+	status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);	
+	if (numDevices == 0)	//no GPU available.
+	{
+		cout << "No GPU device available." << endl;
+		cout << "Choose CPU as default device." << endl;
+		status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 0, NULL, &numDevices);	
+		devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));
+		status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, numDevices, devices, NULL);
+	}
+	else
+	{
+		devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));
+		status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL);
+	}
+	
+
+	/*Step 3: Create context.*/
+	cl_context context = clCreateContext(NULL,1, devices,NULL,NULL,NULL);
+	
+	/*Step 4: Creating command queue associate with the context.*/
+	cl_command_queue commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL);
+
+	/*Step 5: Create program object */
+	const char *filename = "HelloWorld_Kernel.cl";
+	string sourceStr;
+	status = convertToString(filename, sourceStr);
+	const char *source = sourceStr.c_str();
+	size_t sourceSize[] = {strlen(source)};
+	cl_program program = clCreateProgramWithSource(context, 1, &source, sourceSize, NULL);
+	
+	/*Step 6: Build program. */
+	status=clBuildProgram(program, 1,devices,NULL,NULL,NULL);
+
+	/*Step 7: Initial input,output for the host and create memory objects for the kernel*/
+	const char* input = "GdkknVnqkc";
+	size_t strlength = strlen(input);
+	cout << "input string:" << endl;
+	cout << input << endl;
+	char *output = (char*) malloc(strlength + 1);
+
+	cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, (strlength + 1) * sizeof(char),(void *) input, NULL);
+	cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY , (strlength + 1) * sizeof(char), NULL, NULL);
+
+	/*Step 8: Create kernel object */
+	cl_kernel kernel = clCreateKernel(program,"helloworld", NULL);
+
+	/*Step 9: Sets Kernel arguments.*/
+	status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
+	status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&outputBuffer);
+	
+	/*Step 10: Running the kernel.*/
+	size_t global_work_size[1] = {strlength};
+	status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
+
+	/*Step 11: Read the cout put back to host memory.*/
+	status = clEnqueueReadBuffer(commandQueue, outputBuffer, CL_TRUE, 0, strlength * sizeof(char), output, 0, NULL, NULL);
+	
+	output[strlength] = '\0';	//Add the terminal character to the end of output.
+	cout << "\noutput string:" << endl;
+	cout << output << endl;
+
+	/*Step 12: Clean the resources.*/
+	status = clReleaseKernel(kernel);				//Release kernel.
+	status = clReleaseProgram(program);				//Release the program object.
+	status = clReleaseMemObject(inputBuffer);		//Release mem object.
+	status = clReleaseMemObject(outputBuffer);
+	status = clReleaseCommandQueue(commandQueue);	//Release  Command queue.
+	status = clReleaseContext(context);				//Release context.
+
+	if (output != NULL)
+	{
+		free(output);
+		output = NULL;
+	}
+
+	if (devices != NULL)
+	{
+		free(devices);
+		devices = NULL;
+	}
+
+	std::cout<<"Passed!\n";
+	return SUCCESS;
+}
\ No newline at end of file
diff --git a/External/OpenCL/tests/AMDAPP_SDK/HelloWorld/HelloWorld.reference_output b/External/OpenCL/tests/AMDAPP_SDK/HelloWorld/HelloWorld.reference_output
new file mode 100644
index 0000000000..8d8b5fa0d1
--- /dev/null
+++ b/External/OpenCL/tests/AMDAPP_SDK/HelloWorld/HelloWorld.reference_output
@@ -0,0 +1,6 @@
+input string:
+GdkknVnqkc
+
+output string:
+ÿÿÿÿÿÿÿÿÿÿ
+Passed!
diff --git a/External/OpenCL/tests/AMDAPP_SDK/HelloWorld/HelloWorld_Kernel.cl b/External/OpenCL/tests/AMDAPP_SDK/HelloWorld/HelloWorld_Kernel.cl
new file mode 100644
index 0000000000..b4abeadf83
--- /dev/null
+++ b/External/OpenCL/tests/AMDAPP_SDK/HelloWorld/HelloWorld_Kernel.cl
@@ -0,0 +1,171 @@
+/**********************************************************************
+Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+•	Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+•	Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or
+ other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+********************************************************************/
+
+// For clarity,error checking has been omitted.
+
+#include <CL/cl.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <iostream>
+#include <string>
+#include <fstream>
+
+#define SUCCESS 0
+#define FAILURE 1
+
+using namespace std;
+
+/* convert the kernel file into a string */
+int convertToString(const char *filename, std::string& s)
+{
+	size_t size;
+	char*  str;
+	std::fstream f(filename, (std::fstream::in | std::fstream::binary));
+
+	if(f.is_open())
+	{
+		size_t fileSize;
+		f.seekg(0, std::fstream::end);
+		size = fileSize = (size_t)f.tellg();
+		f.seekg(0, std::fstream::beg);
+		str = new char[size+1];
+		if(!str)
+		{
+			f.close();
+			return 0;
+		}
+
+		f.read(str, fileSize);
+		f.close();
+		str[size] = '\0';
+		s = str;
+		delete[] str;
+		return 0;
+	}
+	cout<<"Error: failed to open file\n:"<<filename<<endl;
+	return FAILURE;
+}
+
+int main(int argc, char* argv[])
+{
+
+	/*Step1: Getting platforms and choose an available one.*/
+	cl_uint numPlatforms;	//the NO. of platforms
+	cl_platform_id platform = NULL;	//the chosen platform
+	cl_int	status = clGetPlatformIDs(0, NULL, &numPlatforms);
+	if (status != CL_SUCCESS)
+	{
+		cout << "Error: Getting platforms!" << endl;
+		return FAILURE;
+	}
+
+	/*For clarity, choose the first available platform. */
+	if(numPlatforms > 0)
+	{
+		cl_platform_id* platforms = (cl_platform_id* )malloc(numPlatforms* sizeof(cl_platform_id));
+		status = clGetPlatformIDs(numPlatforms, platforms, NULL);
+		platform = platforms[0];
+		free(platforms);
+	}
+
+	/*Step 2:Query the platform and choose the first GPU device if has one.Otherwise use the CPU as device.*/
+	cl_uint				numDevices = 0;
+	cl_device_id        *devices;
+	status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);	
+	if (numDevices == 0)	//no GPU available.
+	{
+		cout << "No GPU device available." << endl;
+		cout << "Choose CPU as default device." << endl;
+		status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 0, NULL, &numDevices);	
+		devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));
+		status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, numDevices, devices, NULL);
+	}
+	else
+	{
+		devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));
+		status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL);
+	}
+	
+
+	/*Step 3: Create context.*/
+	cl_context context = clCreateContext(NULL,1, devices,NULL,NULL,NULL);
+	
+	/*Step 4: Creating command queue associate with the context.*/
+	cl_command_queue commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL);
+
+	/*Step 5: Create program object */
+	const char *filename = "HelloWorld_Kernel.cl";
+	string sourceStr;
+	status = convertToString(filename, sourceStr);
+	const char *source = sourceStr.c_str();
+	size_t sourceSize[] = {strlen(source)};
+	cl_program program = clCreateProgramWithSource(context, 1, &source, sourceSize, NULL);
+	
+	/*Step 6: Build program. */
+	status=clBuildProgram(program, 1,devices,NULL,NULL,NULL);
+
+	/*Step 7: Initial input,output for the host and create memory objects for the kernel*/
+	const char* input = "GdkknVnqkc";
+	size_t strlength = strlen(input);
+	cout << "input string:" << endl;
+	cout << input << endl;
+	char *output = (char*) malloc(strlength + 1);
+
+	cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, (strlength + 1) * sizeof(char),(void *) input, NULL);
+	cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY , (strlength + 1) * sizeof(char), NULL, NULL);
+
+	/*Step 8: Create kernel object */
+	cl_kernel kernel = clCreateKernel(program,"helloworld", NULL);
+
+	/*Step 9: Sets Kernel arguments.*/
+	status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
+	status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&outputBuffer);
+	
+	/*Step 10: Running the kernel.*/
+	size_t global_work_size[1] = {strlength};
+	status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
+
+	/*Step 11: Read the cout put back to host memory.*/
+	status = clEnqueueReadBuffer(commandQueue, outputBuffer, CL_TRUE, 0, strlength * sizeof(char), output, 0, NULL, NULL);
+	
+	output[strlength] = '\0';	//Add the terminal character to the end of output.
+	cout << "\noutput string:" << endl;
+	cout << output << endl;
+
+	/*Step 12: Clean the resources.*/
+	status = clReleaseKernel(kernel);				//Release kernel.
+	status = clReleaseProgram(program);				//Release the program object.
+	status = clReleaseMemObject(inputBuffer);		//Release mem object.
+	status = clReleaseMemObject(outputBuffer);
+	status = clReleaseCommandQueue(commandQueue);	//Release  Command queue.
+	status = clReleaseContext(context);				//Release context.
+
+	if (output != NULL)
+	{
+		free(output);
+		output = NULL;
+	}
+
+	if (devices != NULL)
+	{
+		free(devices);
+		devices = NULL;
+	}
+
+	std::cout<<"Passed!\n";
+	return SUCCESS;
+}
\ No newline at end of file
diff --git a/External/OpenCL/tests/AMDAPP_SDK/HelloWorld/doc/Hello_World.pdf b/External/OpenCL/tests/AMDAPP_SDK/HelloWorld/doc/Hello_World.pdf
new file mode 100644
index 0000000000..56ea6b199d
Binary files /dev/null and b/External/OpenCL/tests/AMDAPP_SDK/HelloWorld/doc/Hello_World.pdf differ
diff --git a/External/OpenCL/tests/AMDAPP_SDK/Template/CMakeLists.txt b/External/OpenCL/tests/AMDAPP_SDK/Template/CMakeLists.txt
new file mode 100644
index 0000000000..369465532a
--- /dev/null
+++ b/External/OpenCL/tests/AMDAPP_SDK/Template/CMakeLists.txt
@@ -0,0 +1,25 @@
+# Test parameters
+set(TestName Template)
+set(TestSources Template.cpp)
+set(TestData Template_Kernels.cl)
+set(OpenCL_CPPFLAGS "")
+set(OpenCL_LDFLAGS "")
+set(OpenCL_LIBS "")
+
+# Target to build this specific test.
+add_custom_target(opencl-tests-simple-${TestName})
+
+# Add dependencies for target to build all the tests in opencl-tests-simple. 
+add_dependencies(opencl-tests-simple opencl-tests-simple-${TestName})
+create_local_test(${TestName} "${TestSources}" "${TestData}" "${OpenCL_CPPFLAGS}" "${OpenCL_LDFLAGS}" "${OpenCL_LIBS}")
+
+# Populate & propogate list of tests to run them all using check-opencl-simple.
+list(APPEND OPENCL_SIMPLE_TESTS_LIST "${CMAKE_CURRENT_BINARY_DIR}/${TestName}.test")
+set(OPENCL_SIMPLE_TESTS_LIST ${OPENCL_SIMPLE_TESTS_LIST} PARENT_SCOPE)
+
+# Target to run this specific test.
+add_custom_target(check-opencl-simple-${TestName}
+  COMMAND ${TEST_SUITE_LIT} ${TEST_SUITE_LIT_FLAGS} ${TestName}.test
+  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+  DEPENDS opencl-tests-simple-${TestName}
+  USES_TERMINAL)
diff --git a/External/OpenCL/tests/AMDAPP_SDK/Template/Template.cpp b/External/OpenCL/tests/AMDAPP_SDK/Template/Template.cpp
new file mode 100644
index 0000000000..ccd3b139bc
--- /dev/null
+++ b/External/OpenCL/tests/AMDAPP_SDK/Template/Template.cpp
@@ -0,0 +1,680 @@
+/**********************************************************************
+Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+•	Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+•	Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or
+ other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+********************************************************************/
+
+
+#include "Template.hpp"
+
+/*
+ * \brief Host Initialization 
+ *        Allocate and initialize memory 
+ *        on the host. Print input array. 
+ */
+int
+initializeHost(void)
+{
+    width               = 256;
+    input               = NULL;
+    output              = NULL;
+    multiplier          = 2;
+
+    /////////////////////////////////////////////////////////////////
+    // Allocate and initialize memory used by host 
+    /////////////////////////////////////////////////////////////////
+    cl_uint sizeInBytes = width * sizeof(cl_uint);
+    input = (cl_uint *) malloc(sizeInBytes);
+    if(!input)
+    {
+        std::cout << "Error: Failed to allocate input memory on host\n";
+        return SDK_FAILURE;
+    }
+
+    output = (cl_uint *) malloc(sizeInBytes);
+    if(!output)
+    {
+        std::cout << "Error: Failed to allocate input memory on host\n";
+        return SDK_FAILURE;
+    }
+
+    for(cl_uint i = 0; i < width; i++)
+        input[i] = i;
+
+    // print input array
+    print1DArray(std::string("Input").c_str(), input, width);
+    return SDK_SUCCESS;
+}
+
+/*
+ * Converts the contents of a file into a string
+ */
+std::string
+convertToString(const char *filename)
+{
+    size_t size;
+    char*  str;
+    std::string s;
+
+    std::fstream f(filename, (std::fstream::in | std::fstream::binary));
+
+    if(f.is_open())
+    {
+        size_t fileSize;
+        f.seekg(0, std::fstream::end);
+        size = fileSize = (size_t)f.tellg();
+        f.seekg(0, std::fstream::beg);
+
+        str = new char[size+1];
+        if(!str)
+        {
+            f.close();
+            std::cout << "Memory allocation failed";
+            return NULL;
+        }
+
+        f.read(str, fileSize);
+        f.close();
+        str[size] = '\0';
+    
+        s = str;
+        delete[] str;
+        return s;
+    }
+    else
+    {
+        std::cout << "\nFile containg the kernel code(\".cl\") not found. Please copy the required file in the folder containg the executable.\n";
+        exit(1);
+    }
+    return NULL;
+}
+
+/*
+ * \brief OpenCL related initialization 
+ *        Create Context, Device list, Command Queue
+ *        Create OpenCL memory buffer objects
+ *        Load CL file, compile, link CL source 
+ *		  Build program and kernel objects
+ */
+int
+initializeCL(void)
+{
+    cl_int status = 0;
+    size_t deviceListSize;
+
+    //////////////////////////////////////////////////////////////////// 
+    // STEP 1 Getting Platform.
+    //////////////////////////////////////////////////////////////////// 
+    
+    /*
+     * Have a look at the available platforms and pick either
+     * the AMD one if available or a reasonable default.
+     */
+
+    cl_uint numPlatforms;
+    cl_platform_id platform = NULL;
+    status = clGetPlatformIDs(0, NULL, &numPlatforms);
+    if(status != CL_SUCCESS)
+    {
+        std::cout << "Error: Getting Platforms. (clGetPlatformsIDs)\n";
+        return SDK_FAILURE;
+    }
+    
+    if(numPlatforms > 0)
+    {
+        cl_platform_id* platforms = new cl_platform_id[numPlatforms];
+        status = clGetPlatformIDs(numPlatforms, platforms, NULL);
+        if(status != CL_SUCCESS)
+        {
+            std::cout << "Error: Getting Platform Ids. (clGetPlatformsIDs)\n";
+            return SDK_FAILURE;
+        }
+        for(unsigned int i=0; i < numPlatforms; ++i)
+        {
+            char pbuff[100];
+            status = clGetPlatformInfo(
+                        platforms[i],
+                        CL_PLATFORM_VENDOR,
+                        sizeof(pbuff),
+                        pbuff,
+                        NULL);
+            if(status != CL_SUCCESS)
+            {
+                std::cout << "Error: Getting Platform Info.(clGetPlatformInfo)\n";
+                return SDK_FAILURE;
+            }
+            platform = platforms[i];
+            if(!strcmp(pbuff, "Advanced Micro Devices, Inc."))
+            {
+                break;
+            }
+        }
+        delete platforms;
+    }
+
+    if(NULL == platform)
+    {
+        std::cout << "NULL platform found so Exiting Application." << std::endl;
+        return SDK_FAILURE;
+    }
+
+
+    //////////////////////////////////////////////////////////////////// 
+    // STEP 2 Creating context using the platform selected
+    //      Context created from type includes all available
+    //      devices of the specified type from the selected platform 
+    //////////////////////////////////////////////////////////////////// 
+    
+
+    /*
+     * If we could find our platform, use it. Otherwise use just available platform.
+     */
+    cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 };
+
+    context = clCreateContextFromType(cps, 
+                                      CL_DEVICE_TYPE_GPU, 
+                                      NULL, 
+                                      NULL, 
+                                      &status);
+    if(status != CL_SUCCESS) 
+    {  
+        std::cout << "Error: Creating Context. (clCreateContextFromType)\n";
+        return SDK_FAILURE; 
+    }
+
+
+    //////////////////////////////////////////////////////////////////// 
+    // STEP 3
+    //      3.1 Query context for the device list size,
+    //      3.2 Allocate that much memory using malloc or new
+    //      3.3 Again query context info to get the array of device
+    //              available in the created context
+    //////////////////////////////////////////////////////////////////// 
+    
+    // First, get the size of device list data
+    status = clGetContextInfo(context, 
+                              CL_CONTEXT_DEVICES, 
+                              0, 
+                              NULL, 
+                              &deviceListSize);
+    if(status != CL_SUCCESS) 
+    {  
+        std::cout <<
+            "Error: Getting Context Info \
+            (device list size, clGetContextInfo)\n";
+        return SDK_FAILURE;
+    }
+
+    devices = (cl_device_id *)malloc(deviceListSize);
+    if(devices == 0)
+    {
+        std::cout << "Error: No devices found.\n";
+        return SDK_FAILURE;
+    }
+
+    // Now, get the device list data
+    status = clGetContextInfo(
+                 context, 
+                 CL_CONTEXT_DEVICES, 
+                 deviceListSize, 
+                 devices, 
+                 NULL);
+    if(status != CL_SUCCESS) 
+    { 
+        std::cout <<
+            "Error: Getting Context Info \
+            (device list, clGetContextInfo)\n";
+        return SDK_FAILURE;
+    }
+
+    //////////////////////////////////////////////////////////////////// 
+    // STEP 4 Creating command queue for a single device
+    //      Each device in the context can have a 
+    //      dedicated commandqueue object for itself
+    //////////////////////////////////////////////////////////////////// 
+    
+    commandQueue = clCreateCommandQueue(
+                       context, 
+                       devices[0], 
+                       0, 
+                       &status);
+    if(status != CL_SUCCESS) 
+    { 
+        std::cout << "Creating Command Queue. (clCreateCommandQueue)\n";
+        return SDK_FAILURE;
+    }
+
+    /////////////////////////////////////////////////////////////////
+    // STEP 5 Creating cl_buffer objects from host buffer
+    //          These buffer objects can be passed to the kernel
+    //          as kernel arguments
+    /////////////////////////////////////////////////////////////////
+    inputBuffer = clCreateBuffer(
+                      context, 
+                      CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
+                      sizeof(cl_uint) * width,
+                      input, 
+                      &status);
+    if(status != CL_SUCCESS) 
+    { 
+        std::cout << "Error: clCreateBuffer (inputBuffer)\n";
+        return SDK_FAILURE;
+    }
+
+    outputBuffer = clCreateBuffer(
+                       context, 
+                       CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
+                       sizeof(cl_uint) * width,
+                       output, 
+                       &status);
+    if(status != CL_SUCCESS) 
+    { 
+        std::cout << "Error: clCreateBuffer (outputBuffer)\n";
+        return SDK_FAILURE;
+    }
+
+
+    /////////////////////////////////////////////////////////////////
+    // STEP 6. Building Kernel
+    //      6.1 Load CL file, using basic file i/o
+    //      6.2 Build CL program object
+    //      6.3 Create CL kernel object
+    /////////////////////////////////////////////////////////////////
+    const char * filename  = "Template_Kernels.cl";
+    std::string  sourceStr = convertToString(filename);
+    const char * source    = sourceStr.c_str();
+    size_t sourceSize[]    = { strlen(source) };
+
+    program = clCreateProgramWithSource(
+                  context, 
+                  1, 
+                  &source,
+                  sourceSize,
+                  &status);
+    if(status != CL_SUCCESS) 
+    { 
+      std::cout <<
+               "Error: Loading Binary into cl_program \
+               (clCreateProgramWithBinary)\n";
+      return SDK_FAILURE;
+    }
+
+    // create a cl program executable for all the devices specified
+    status = clBuildProgram(program, 1, devices, NULL, NULL, NULL);
+    if(status != CL_SUCCESS) 
+    { 
+        std::cout << "Error: Building Program (clBuildProgram)\n";
+        return SDK_FAILURE; 
+    }
+
+    // get a kernel object handle for a kernel with the given name
+    kernel = clCreateKernel(program, "templateKernel", &status);
+    if(status != CL_SUCCESS) 
+    {  
+        std::cout << "Error: Creating Kernel from program. (clCreateKernel)\n";
+        return SDK_FAILURE;
+    }
+
+    return SDK_SUCCESS;
+}
+
+
+/*
+ * \brief Run OpenCL program 
+ *
+ *        Bind host variables to kernel arguments 
+ *        Run the CL kernel
+ */
+int 
+runCLKernels(void)
+{
+    cl_int   status;
+    cl_uint maxDims;
+    cl_event events[2];
+    size_t globalThreads[1];
+    size_t localThreads[1];
+    size_t maxWorkGroupSize;
+    size_t maxWorkItemSizes[3];
+
+    //////////////////////////////////////////////////////////////////// 
+    // STEP 7 Analyzing proper workgroup size for the kernel
+    //          by querying device information
+    //    7.1 Device Info CL_DEVICE_MAX_WORK_GROUP_SIZE
+    //    7.2 Device Info CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS
+    //    7.3 Device Info CL_DEVICE_MAX_WORK_ITEM_SIZES
+    //////////////////////////////////////////////////////////////////// 
+    
+    
+    /**
+    * Query device capabilities. Maximum 
+    * work item dimensions and the maximmum
+    * work item sizes
+    */ 
+    status = clGetDeviceInfo(
+        devices[0], 
+        CL_DEVICE_MAX_WORK_GROUP_SIZE, 
+        sizeof(size_t), 
+        (void*)&maxWorkGroupSize, 
+        NULL);
+    if(status != CL_SUCCESS) 
+    {  
+        std::cout << "Error: Getting Device Info. (clGetDeviceInfo)\n";
+        return SDK_FAILURE;
+    }
+    
+    status = clGetDeviceInfo(
+        devices[0], 
+        CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, 
+        sizeof(cl_uint), 
+        (void*)&maxDims, 
+        NULL);
+    if(status != CL_SUCCESS) 
+    {  
+        std::cout << "Error: Getting Device Info. (clGetDeviceInfo)\n";
+        return SDK_FAILURE;
+    }
+
+    status = clGetDeviceInfo(
+        devices[0], 
+        CL_DEVICE_MAX_WORK_ITEM_SIZES, 
+        sizeof(size_t)*maxDims,
+        (void*)maxWorkItemSizes,
+        NULL);
+    if(status != CL_SUCCESS) 
+    {  
+        std::cout << "Error: Getting Device Info. (clGetDeviceInfo)\n";
+        return SDK_FAILURE;
+    }
+    
+    globalThreads[0] = width;
+    localThreads[0]  = 1;
+
+    if(localThreads[0] > maxWorkGroupSize ||
+        localThreads[0] > maxWorkItemSizes[0])
+    {
+        std::cout << "Unsupported: Device does not support requested number of work items.";
+        return SDK_FAILURE;
+    }
+
+    //////////////////////////////////////////////////////////////////// 
+    // STEP 8 Set appropriate arguments to the kernel
+    //    8.1 Kernel Arg outputBuffer ( cl_mem object)
+    //    8.2 Kernel Arg inputBuffer (cl_mem object)
+    //    8.3 Kernel Arg multiplier (cl_uint)
+    //////////////////////////////////////////////////////////////////// 
+    
+    // the output array to the kernel
+    status = clSetKernelArg(
+                    kernel, 
+                    0, 
+                    sizeof(cl_mem), 
+                    (void *)&outputBuffer);
+    if(status != CL_SUCCESS) 
+    { 
+        std::cout << "Error: Setting kernel argument. (output)\n";
+        return SDK_FAILURE;
+    }
+
+    // the input array to the kernel
+    status = clSetKernelArg(
+                    kernel, 
+                    1, 
+                    sizeof(cl_mem), 
+                    (void *)&inputBuffer);
+    if(status != CL_SUCCESS) 
+    { 
+        std::cout << "Error: Setting kernel argument. (input)\n";
+        return SDK_FAILURE;
+    }
+
+    // multiplier
+    status = clSetKernelArg(
+                    kernel, 
+                    2, 
+                    sizeof(cl_uint), 
+                    (void *)&multiplier);
+    if(status != CL_SUCCESS) 
+    { 
+        std::cout << "Error: Setting kernel argument. (multiplier)\n";
+        return SDK_FAILURE;
+    }
+
+    //////////////////////////////////////////////////////////////////// 
+    // STEP 9 Enqueue a kernel run call.
+    //          Wait till the event completes and release the event
+    //////////////////////////////////////////////////////////////////// 
+    status = clEnqueueNDRangeKernel(
+                 commandQueue,
+                 kernel,
+                 1,
+                 NULL,
+                 globalThreads,
+                 localThreads,
+                 0,
+                 NULL,
+                 &events[0]);
+    if(status != CL_SUCCESS) 
+    { 
+        std::cout <<
+            "Error: Enqueueing kernel onto command queue. \
+            (clEnqueueNDRangeKernel)\n";
+        return SDK_FAILURE;
+    }
+
+
+    // wait for the kernel call to finish execution
+    status = clWaitForEvents(1, &events[0]);
+    if(status != CL_SUCCESS) 
+    { 
+        std::cout <<
+            "Error: Waiting for kernel run to finish. \
+            (clWaitForEvents)\n";
+        return SDK_FAILURE;
+    }
+
+    status = clReleaseEvent(events[0]);
+    if(status != CL_SUCCESS) 
+    { 
+        std::cout <<
+            "Error: Release event object. \
+            (clReleaseEvent)\n";
+        return SDK_FAILURE;
+    }
+
+    //////////////////////////////////////////////////////////////////// 
+    // STEP 10  Enqueue readBuffer to read the output back
+    //  Wait for the event and release the event
+    //////////////////////////////////////////////////////////////////// 
+    status = clEnqueueReadBuffer(
+                commandQueue,
+                outputBuffer,
+                CL_TRUE,
+                0,
+                width * sizeof(cl_uint),
+                output,
+                0,
+                NULL,
+                &events[1]);
+    
+    if(status != CL_SUCCESS) 
+    { 
+        std::cout << 
+            "Error: clEnqueueReadBuffer failed. \
+             (clEnqueueReadBuffer)\n";
+        return SDK_FAILURE;
+    }
+    
+    // Wait for the read buffer to finish execution
+    status = clWaitForEvents(1, &events[1]);
+    if(status != CL_SUCCESS) 
+    { 
+        std::cout <<
+            "Error: Waiting for read buffer call to finish. \
+            (clWaitForEvents)\n";
+        return SDK_FAILURE;
+    }
+    
+    status = clReleaseEvent(events[1]);
+    if(status != CL_SUCCESS) 
+    { 
+        std::cout <<
+            "Error: Release event object. \
+            (clReleaseEvent)\n";
+        return SDK_FAILURE;
+    }
+    return SDK_SUCCESS;
+}
+
+
+/*
+ * \brief Release OpenCL resources (Context, Memory etc.) 
+ */
+int  
+cleanupCL(void)
+{
+    cl_int status;
+
+    //////////////////////////////////////////////////////////////////// 
+    // STEP 11  CLean up the opencl resources used 
+    //////////////////////////////////////////////////////////////////// 
+    
+    status = clReleaseKernel(kernel);
+    if(status != CL_SUCCESS)
+    {
+        std::cout << "Error: In clReleaseKernel \n";
+        return SDK_FAILURE; 
+    }
+    status = clReleaseProgram(program);
+    if(status != CL_SUCCESS)
+    {
+        std::cout << "Error: In clReleaseProgram\n";
+        return SDK_FAILURE; 
+    }
+    status = clReleaseMemObject(inputBuffer);
+    if(status != CL_SUCCESS)
+    {
+        std::cout << "Error: In clReleaseMemObject (inputBuffer)\n";
+        return SDK_FAILURE; 
+    }
+    status = clReleaseMemObject(outputBuffer);
+    if(status != CL_SUCCESS)
+    {
+        std::cout << "Error: In clReleaseMemObject (outputBuffer)\n";
+        return SDK_FAILURE; 
+    }
+    status = clReleaseCommandQueue(commandQueue);
+    if(status != CL_SUCCESS)
+    {
+        std::cout << "Error: In clReleaseCommandQueue\n";
+        return SDK_FAILURE;
+    }
+    status = clReleaseContext(context);
+    if(status != CL_SUCCESS)
+    {
+        std::cout << "Error: In clReleaseContext\n";
+        return SDK_FAILURE;
+    }
+    return SDK_SUCCESS;
+}
+
+
+/* 
+ * \brief Releases program's resources 
+ */
+void
+cleanupHost(void)
+{
+    if(input != NULL)
+    {
+        free(input);
+        input = NULL;
+    }
+    if(output != NULL)
+    {
+        free(output);
+        output = NULL;
+    }
+    if(devices != NULL)
+    {
+        free(devices);
+        devices = NULL;
+    }
+}
+
+
+/*
+ * \brief Print no more than 256 elements of the given array.
+ *
+ *        Print Array name followed by elements.
+ */
+void print1DArray(
+         const std::string arrayName, 
+         const unsigned int * arrayData, 
+         const unsigned int length)
+{
+    cl_uint i;
+    cl_uint numElementsToPrint = (256 < length) ? 256 : length;
+
+    std::cout << std::endl;
+    std::cout << arrayName << ":" << std::endl;
+    for(i = 0; i < numElementsToPrint; ++i)
+    {
+        std::cout << arrayData[i] << " ";
+    }
+    std::cout << std::endl;
+
+}
+
+void verify()
+{
+    bool passed = true;
+    for(unsigned int i = 0; i < width; ++i)
+        if(input[i] * multiplier != output[i])
+            passed = false;
+
+    if(passed == true)
+        std::cout << "Passed!\n" << std::endl;
+    else
+        std::cout << "Failed!\n" << std::endl;
+}
+
+int 
+main(int argc, char * argv[])
+{
+    // Initialize Host application 
+    if(initializeHost() != SDK_SUCCESS)
+        return SDK_FAILURE;
+
+    // Initialize OpenCL resources
+    if(initializeCL() != SDK_SUCCESS)
+        return SDK_FAILURE;
+
+    // Run the CL program
+    if(runCLKernels() != SDK_SUCCESS)
+        return SDK_FAILURE;
+
+    // Print output array
+    print1DArray(std::string("Output"), output, width);
+
+    // Verify output
+    verify();
+
+    // Releases OpenCL resources 
+    if(cleanupCL()!= SDK_SUCCESS)
+        return SDK_FAILURE;
+
+    // Release host resources
+    cleanupHost();
+
+    return SDK_SUCCESS;
+}
diff --git a/External/OpenCL/tests/AMDAPP_SDK/Template/Template.hpp b/External/OpenCL/tests/AMDAPP_SDK/Template/Template.hpp
new file mode 100644
index 0000000000..54749d4dcd
--- /dev/null
+++ b/External/OpenCL/tests/AMDAPP_SDK/Template/Template.hpp
@@ -0,0 +1,119 @@
+/**********************************************************************
+Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+•	Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+•	Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or
+ other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+********************************************************************/
+
+
+#ifndef TEMPLATE_H_
+#define TEMPLATE_H_
+
+
+
+
+#include <CL/cl.h>
+#include <string.h>
+#include <cstdlib>
+#include <iostream>
+#include <string>
+#include <fstream>
+
+
+// GLOBALS 
+#define SDK_SUCCESS 0
+#define SDK_FAILURE 1
+
+/*
+ * Input data is stored here.
+ */
+cl_uint *input;
+
+/*
+ * Output data is stored here.
+ */
+cl_uint *output;
+
+/*
+ * Multiplier is stored in this variable 
+ */
+cl_uint multiplier;
+
+/* problem size for 1D algorithm and width of problem size for 2D algorithm */
+cl_uint width;
+
+/* The memory buffer that is used as input/output for OpenCL kernel */
+cl_mem   inputBuffer;
+cl_mem	 outputBuffer;
+
+cl_context          context;
+cl_device_id        *devices;
+cl_command_queue    commandQueue;
+
+cl_program program;
+
+/* This program uses only one kernel and this serves as a handle to it */
+cl_kernel  kernel;
+
+
+// FUNCTION DECLARATIONS 
+
+/*
+ * OpenCL related initialisations are done here.
+ * Context, Device list, Command Queue are set up.
+ * Calls are made to set up OpenCL memory buffers that this program uses
+ * and to load the programs into memory and get kernel handles.
+ */
+int initializeCL(void);
+
+/*
+ * Convert char* to string
+ */
+std::string convertToString(const char * filename);
+
+/*
+ * This is called once the OpenCL context, memory etc. are set up,
+ * the program is loaded into memory and the kernel handles are ready.
+ * 
+ * It sets the values for kernels' arguments and enqueues calls to the kernels
+ * on to the command queue and waits till the calls have finished execution.
+ *
+ * It also gets kernel start and end time if profiling is enabled.
+ * @return returns SDK_SUCCESS on success and SDK_FAILURE otherwise
+ */
+int runCLKernels(void);
+
+/**
+ * Releases OpenCL resources (Context, Memory etc.) 
+ * @return returns SDK_SUCCESS on success and SDK_FAILURE otherwise
+ */
+int cleanupCL(void);
+
+/**
+ * Releases program's resources
+ * @return returns SDK_SUCCESS on success and SDK_FAILURE otherwise 
+ */
+void cleanupHost(void);
+
+/*
+ * Prints no more than 256 elements of the given array.
+ * Prints full array if length is less than 256.
+ *
+ * Prints Array name followed by elements.
+ */
+void print1DArray(
+         const std::string arrayName, 
+         const unsigned int * arrayData, 
+         const unsigned int length);
+
+
+#endif  /* #ifndef TEMPLATE_H_ */
diff --git a/External/OpenCL/tests/AMDAPP_SDK/Template/Template.reference_output b/External/OpenCL/tests/AMDAPP_SDK/Template/Template.reference_output
new file mode 100644
index 0000000000..9bfabcf9e1
--- /dev/null
+++ b/External/OpenCL/tests/AMDAPP_SDK/Template/Template.reference_output
@@ -0,0 +1,8 @@
+
+Input:
+0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 
+
+Output:
+0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30 32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62 64 66 68 70 72 74 76 78 80 82 84 86 88 90 92 94 96 98 100 102 104 106 108 110 112 114 116 118 120 122 124 126 128 130 132 134 136 138 140 142 144 146 148 150 152 154 156 158 160 162 164 166 168 170 172 174 176 178 180 182 184 186 188 190 192 194 196 198 200 202 204 206 208 210 212 214 216 218 220 222 224 226 228 230 232 234 236 238 240 242 244 246 248 250 252 254 256 258 260 262 264 266 268 270 272 274 276 278 280 282 284 286 288 290 292 294 296 298 300 302 304 306 308 310 312 314 316 318 320 322 324 326 328 330 332 334 336 338 340 342 344 346 348 350 352 354 356 358 360 362 364 366 368 370 372 374 376 378 380 382 384 386 388 390 392 394 396 398 400 402 404 406 408 410 412 414 416 418 420 422 424 426 428 430 432 434 436 438 440 442 444 446 448 450 452 454 456 458 460 462 464 466 468 470 472 474 476 478 480 482 484 486 488 490 492 494 496 498 500 502 504 506 508 510 
+Passed!
+
diff --git a/External/OpenCL/tests/AMDAPP_SDK/Template/Template_Kernels.cl b/External/OpenCL/tests/AMDAPP_SDK/Template/Template_Kernels.cl
new file mode 100644
index 0000000000..62dcf6aff0
--- /dev/null
+++ b/External/OpenCL/tests/AMDAPP_SDK/Template/Template_Kernels.cl
@@ -0,0 +1,30 @@
+/**********************************************************************
+Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+•	Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+•	Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or
+ other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+********************************************************************/
+
+/*!
+ * Sample kernel which multiplies every element of the input array with
+ * a constant and stores it at the corresponding output array
+ */
+
+
+__kernel void templateKernel(__global  unsigned int * output,
+                             __global  unsigned int * input,
+                             const     unsigned int multiplier)
+{
+    uint tid = get_global_id(0);
+    
+    output[tid] = input[tid] * multiplier;
+}
diff --git a/External/OpenCL/tests/AMDAPP_SDK/Template/docs/Template.pdf b/External/OpenCL/tests/AMDAPP_SDK/Template/docs/Template.pdf
new file mode 100644
index 0000000000..7e3a0fe45c
Binary files /dev/null and b/External/OpenCL/tests/AMDAPP_SDK/Template/docs/Template.pdf differ
diff --git a/External/OpenCL/tests/CMakeLists.txt b/External/OpenCL/tests/CMakeLists.txt
new file mode 100644
index 0000000000..0b55c2489a
--- /dev/null
+++ b/External/OpenCL/tests/CMakeLists.txt
@@ -0,0 +1,4 @@
+add_subdirectory(AMDAPP_SDK)
+
+# Propogate the list of tests to run.
+set(OPENCL_SIMPLE_TESTS_LIST ${OPENCL_SIMPLE_TESTS_LIST} PARENT_SCOPE)