Skip to content

Commit 14fbe8d

Browse files
authored
Merge pull request #19 from codereport/tinyapl
✨ bs for matx
2 parents fd63181 + a9d8a4f commit 14fbe8d

File tree

2 files changed

+151
-3
lines changed

2 files changed

+151
-3
lines changed

code/matx/CMakeLists.txt

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ FetchContent_Declare(matx
1111
GIT_TAG main
1212
)
1313

14-
FetchContent_GetProperties(arrayfire)
14+
FetchContent_GetProperties(matx)
1515
if(NOT matx_POPULATED)
1616
FetchContent_Populate(matx)
1717
add_subdirectory(${matx_SOURCE_DIR} ${matx_BINARY_DIR} EXCLUDE_FROM_ALL)
@@ -27,8 +27,18 @@ if (NOT CMAKE_CXX_COMPILER)
2727
set(CMAKE_CXX_COMPILER "/usr/bin/g++")
2828
endif()
2929

30-
set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc)
30+
set(CMAKE_CUDA_COMPILER /opt/nvidia/hpc_sdk/Linux_x86_64/24.11/compilers/bin/nvcc)
3131

32-
add_executable(test test.cu)
32+
set(CMAKE_CUDA_ARCHITECTURES 75)
33+
34+
# Optimization flags
35+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -ftree-vectorize -march=native")
36+
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3 -use_fast_math -arch=compute_75 -code=sm_75")
37+
38+
# Enable Link Time Optimization (LTO)
39+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -flto")
40+
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -flto")
41+
42+
add_executable(test bs.cu)
3343
set_property(TARGET test PROPERTY CXX_STANDARD 20)
3444
target_link_libraries(test PRIVATE matx::matx)

code/matx/bs.cu

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
#include "matx.h"
2+
#include <cassert>
3+
#include <cstdio>
4+
#include <math.h>
5+
#include <memory>
6+
7+
using namespace matx;
8+
9+
/**
10+
* MatX uses C++ expression templates to build arithmetic expressions that compile into a lazily-evaluated
11+
* type for executing on the device. Currently, nvcc cannot see certain optimizations
12+
* when building the expression tree that would be obvious by looking at the code. Specifically any code reusing
13+
* the same tensor multiple times appears to the compiler as separate tensors, and it may issue multiple load
14+
* instructions. While caching helps, this can have a slight performance impact when compared to native CUDA
15+
* kernels. To work around this problem, complex expressions can be placed in a custom operator by adding some
16+
* boilerplate code around the original expression. This custom operator can then be used either alone or inside
17+
 * other arithmetic expressions, and only a single load is issued for each tensor.
18+
*
19+
 * This example uses the Black-Scholes equation to demonstrate the two ways to implement the equation in MatX, and
20+
* shows the performance difference.
21+
*/
22+
23+
/* Custom operator */
24+
/**
 * Custom MatX operator computing the Black-Scholes call price per element.
 *
 * Wrapping the whole formula in one operator means each input tensor is
 * loaded exactly once per element, avoiding the duplicate loads nvcc can
 * emit when the same tensor appears several times in an expression tree.
 *
 * @tparam O  output tensor type (prices), indexed per element
 * @tparam I1 strike price K (scalar tensor, called with no index)
 * @tparam I2 volatility V (scalar tensor)
 * @tparam I3 spot price S (indexed per element)
 * @tparam I4 risk-free rate r (scalar tensor)
 * @tparam I5 time to maturity T (scalar tensor)
 */
template <class O, class I1, class I2, class I3, class I4, class I5>
class BlackScholes : public BaseOp<BlackScholes<O, I1, I2, I3, I4, I5>>
{
private:
  O out_;
  I1 K_;
  I2 V_;
  I3 S_;
  I4 r_;
  I5 T_;

public:
  // Fix: mem-initializer list now matches the declaration order above.
  // Members are always initialized in declaration order regardless of the
  // list, so the previous out-of-order list was misleading (-Wreorder).
  BlackScholes(O out, I1 K, I2 V, I3 S, I4 r, I5 T)
      : out_(out), K_(K), V_(V), S_(S), r_(r), T_(T) {}

  // Evaluate the closed-form call price for element idx into out_(idx).
  __device__ inline void operator()(index_t idx)
  {
    auto V = V_();
    auto K = K_();
    auto S = S_(idx);
    auto T = T_();
    auto r = r_();

    // d1/d2 terms of the Black-Scholes formula; explicit float-suffixed
    // math calls (sqrtf/logf/expf) make single precision unambiguous.
    auto VsqrtT = V * sqrtf(T);
    auto d1 = (logf(S / K) + (r + 0.5f * V * V) * T) / VsqrtT;
    auto d2 = d1 - VsqrtT;
    auto cdf_d1 = normcdff(d1); // N(d1)
    auto cdf_d2 = normcdff(d2); // N(d2)
    auto expRT = expf(-r * T);  // discount factor e^{-rT}

    out_(idx) = S * cdf_d1 - K * expRT * cdf_d2;
  }

  __host__ __device__ inline index_t Size(uint32_t i) const { return out_.Size(i); }
  static inline constexpr __host__ __device__ int32_t Rank() { return O::Rank(); }
};
60+
61+
/**
 * Black-Scholes call pricing expressed as a plain MatX operator expression.
 *
 * Builds the lazily-evaluated expression tree from the input tensors and
 * executes it once on the supplied executor; results land in `output`.
 */
template <typename T1>
void compute_black_scholes_matx(tensor_t<T1, 0> &K,
                                tensor_t<T1, 1> &S,
                                tensor_t<T1, 0> &V,
                                tensor_t<T1, 0> &r,
                                tensor_t<T1, 0> &T,
                                tensor_t<T1, 1> &output,
                                cudaExecutor &exec)
{
  // Sub-expressions of the closed-form solution; each is a lazy operator,
  // no work happens until run() below.
  auto vol_sqrt_t = V * sqrt(T);
  auto d_one = (log(S / K) + (r + 0.5f * V * V) * T) / vol_sqrt_t;
  auto d_two = d_one - vol_sqrt_t;
  auto discount = exp(-1.f * r * T);

  // Assemble the full pricing expression and execute it in one shot.
  (output = S * normcdf(d_one) - K * discount * normcdf(d_two)).run(exec);
}
79+
80+
/**
 * Benchmarks the custom-operator Black-Scholes implementation, then runs the
 * plain MatX-expression version and prints a sample of its output.
 *
 * Fixes vs. original: synchronize before reading device results on the host,
 * destroy the timing events (previously leaked), initialize time_ms.
 */
int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
{
  MATX_ENTER_HANDLER();

  using dtype = float;

  index_t input_size = 100000;
  auto output_tensor = make_tensor<dtype>({input_size});
  auto S_tensor = make_tensor<dtype>({input_size}); // spot prices
  auto K_tensor = make_tensor<dtype>({});           // strike
  auto V_tensor = make_tensor<dtype>({});           // volatility
  auto r_tensor = make_tensor<dtype>({});           // risk-free rate
  auto T_tensor = make_tensor<dtype>({});           // time to maturity
  float time_ms = 0.0f; // initialized in case event timing fails
  int num_iterations = 99;

  // Spot prices cycle through [90, 110); scalar parameters are fixed.
  for (index_t i = 0; i < input_size; i++)
  {
    S_tensor(i) = (dtype)90 + dtype(i % 20);
  }
  K_tensor() = (dtype)100.;
  V_tensor() = (dtype)0.1;
  r_tensor() = (dtype)0.05;
  T_tensor() = (dtype)1.0;

  cudaStream_t stream;
  cudaStreamCreate(&stream);
  cudaExecutor exec{stream};
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  // Warm-up launch so one-time setup costs are excluded from the timing loop.
  BlackScholes(output_tensor, K_tensor, V_tensor, S_tensor, r_tensor, T_tensor).run(exec);
  exec.sync();

  cudaEventRecord(start, stream);
  for (int i = 0; i < num_iterations; i++)
  {
    BlackScholes(output_tensor, K_tensor, V_tensor, S_tensor, r_tensor, T_tensor).run(exec);
  }
  cudaEventRecord(stop, stream);
  exec.sync();
  cudaEventElapsedTime(&time_ms, start, stop);

  printf("Black-Scholes time = %.2fus per iteration\n",
         time_ms * 1e3 / num_iterations);

  compute_black_scholes_matx(K_tensor, S_tensor, V_tensor, r_tensor, T_tensor, output_tensor, exec);
  // Fix: the expression above runs asynchronously on `stream`; synchronize
  // before reading output_tensor on the host.
  exec.sync();

  printf("First 20 values of computed Black-Scholes output:\n");
  for (index_t i = 0; i < 20; i++)
  {
    printf("%f\n", static_cast<float>(output_tensor(i)));
  }

  // Fix: release the timing events (previously leaked) before the stream.
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  cudaStreamDestroy(stream);
  MATX_CUDA_CHECK_LAST_ERROR();
  MATX_EXIT_HANDLER();
}

0 commit comments

Comments
 (0)